In [1]:
import os
import time
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
from tqdm import tqdm

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
train_transformer = transforms.Compose([transforms.Resize((224,224)),transforms.RandomApply([
        transforms.CenterCrop((180, 180)),
        transforms.Resize((224, 224)),
        transforms.RandomRotation(20, fill=(0,)),
        transforms.RandomHorizontalFlip(),
        ],0.7)
  ])

validation_transformer = transforms.Compose([transforms.Resize((224,224)),transforms.RandomApply([
        transforms.CenterCrop((180, 180)),
        transforms.Resize((224, 224)),
        transforms.RandomRotation(20, fill=(0,)),
        transforms.RandomHorizontalFlip(),
        ],0.7)
  ])

In [3]:
# Custom Lungs Dataset that can be constructed into Train, Test, Validation dataset respectively, and select a Dataset (Normal-Infected or Covid-NonCovid) to use, based on the binary classifier implementation.
class LungDataset3C(Dataset):
    def __init__(self, group):

        self.img_size = (150, 150)
        
        self.class_names = ['normal', 'covid', 'non-covid']
        self.classes = {0: 'normal', 1: 'infected_covid', 2: 'infected_non_covid'}
        
        self.groups = [group]
        
        # Number of images in each part of the dataset
        self.dataset_numbers = {'train_normal': 1341,\
                                'train_infected_covid': 1345,\
                                'train_infected_non_covid': 2530,\
                                'val_normal': 8,\
                                'val_infected_covid': 9,\
                                'val_infected_non_covid': 8,\
                                'test_normal': 234,\
                                'test_infected_covid': 139,\
                                'test_infected_non_covid': 242}
        
    def get_dataset_path(self, _class):
        sub_path = None
        group = self.groups[0]
        if _class == self.classes[1]:
            sub_path = os.path.join("infected", "covid")
        elif _class == self.classes[2]:
            sub_path = os.path.join("infected", "non-covid")
        else:
            sub_path = "normal"
        return os.path.join("dataset", group, sub_path)

    def filter_dataset_numbers(self):
        filtered_dataset_numbers_map = dict()
        for key, value in self.dataset_numbers.items():
            if self.groups[0] in key:
                filtered_dataset_numbers_map[key] = value
        return filtered_dataset_numbers_map

    def describe(self):
        filtered_dataset_numbers_map = self.filter_dataset_numbers()
        # Generate description
        msg = "This is the Lung {} Dataset used for the Small Project Demo in the 50.039 Deep Learning class".format(self.groups[0].upper())
        msg += " in Feb-March 2021. \n"
        msg += "It contains a total of {} images, ".format(len(self))
        msg += "of size {} by {}.\n".format(self.img_size[0], self.img_size[1])
        msg += "Images have been split in three groups: training, testing and validation sets.\n"
        msg += "The images are stored in the following locations "
        msg += "and each one contains the following number of images:\n"
        for group in self.groups:
            for _class in self.classes.values():
                label = "{}_{}".format(group, _class)
                path = self.get_dataset_path(_class)
                msg += " - {}, in folder {}: {} images.\n".format(label, path, filtered_dataset_numbers_map[label])
        print(msg)
    
    def open_img(self, _class, index):
        group = self.groups[0]
        if _class not in self.classes.values():
            raise ValueError("Input class not found! Please input: {}. Got: {}".format(list(self.classes.values()), _class))
        max_val = self.dataset_numbers['{}_{}'.format(group, _class)]
        if index < 0 or index >= max_val:
            raise ValueError("Index out of range! Should be (0 ~ {}) but got {}".format(max_val-1, index))
        path_to_file = os.path.join(self.get_dataset_path(_class), "{}.jpg".format(index))
        imgs = []
        with open(path_to_file, 'rb') as f:
            img = Image.open(f)
            if self.groups[0] == "train":
              for _ in range(10):
                img = train_transformer(img)
                image = np.asarray(img) / 255
                image = transforms.functional.to_tensor(np.array(image)).float()
                imgs.append(image)
            else:
              img = validation_transformer(img)
              image = np.asarray(img) / 255
              image = transforms.functional.to_tensor(np.array(image)).float()
              imgs.append(image)
        f.close()
        return imgs
    
    def show_img(self, _class, index):
        # Open image
        im = self.open_img(_class, index)
        
        # Display
        plt.imshow(im)

    def __len__(self):
        length = 0
        for key, item in self.dataset_numbers.items():
            if self.groups[0] in key:
                  length += item
        return length

    def __getitem__(self, index):
        filtered_dataset_numbers_map = self.filter_dataset_numbers()
        first_val = int(list(filtered_dataset_numbers_map.values())[0])
        second_val = int(list(filtered_dataset_numbers_map.values())[1])
        if index < first_val:
            _class = 'normal'
            labels = 0
        elif first_val <= index < first_val + second_val:
            _class = 'infected_covid'
            index = index - first_val
            labels = 1
        else:
            _class = 'infected_non_covid'
            index = index - first_val - second_val
            labels = 2
        imgs = self.open_img(_class, index)
        return imgs, labels

In [4]:
trainset_normal_infected = LungDataset3C(group="train")
testset_nomral_infected = LungDataset3C(group="test")
valset_normal_infected = LungDataset3C(group="val")

train_loader = DataLoader(trainset_normal_infected, batch_size=32, shuffle=True)
test_loader = DataLoader(testset_nomral_infected, batch_size=32, shuffle=True)
val_loader = DataLoader(valset_normal_infected, batch_size=32, shuffle=True)

print(train_loader, test_loader, val_loader)
print(len(train_loader.dataset), len(test_loader.dataset), len(val_loader.dataset))

train_loader.dataset.describe()
test_loader.dataset.describe()
val_loader.dataset.describe()

<torch.utils.data.dataloader.DataLoader object at 0x7f3bec0c9e10> <torch.utils.data.dataloader.DataLoader object at 0x7f3bec0c9ef0> <torch.utils.data.dataloader.DataLoader object at 0x7f3bec0c9f28>
5216 615 25
This is the Lung TRAIN Dataset used for the Small Project Demo in the 50.039 Deep Learning class in Feb-March 2021. 
It contains a total of 5216 images, of size 150 by 150.
Images have been split in three groups: training, testing and validation sets.
The images are stored in the following locations and each one contains the following number of images:
 - train_normal, in folder dataset/train/normal: 1341 images.
 - train_infected_covid, in folder dataset/train/infected/covid: 1345 images.
 - train_infected_non_covid, in folder dataset/train/infected/non-covid: 2530 images.

This is the Lung TEST Dataset used for the Small Project Demo in the 50.039 Deep Learning class in Feb-March 2021. 
It contains a total of 615 images, of size 150 by 150.
Images have been split in three group

In [5]:
class_names = train_loader.dataset.class_names
def show_images(images, labels, preds):
    plt.figure(figsize=(16, 8))
    for i, image in enumerate(images):
        plt.subplot(4, 8, i + 1, xticks=[], yticks=[])
        plt.imshow(np.squeeze(image))
        col = 'green'
        if preds[i] != labels[i]:
            col = 'red'
        plt.rc('axes', labelsize=14)
        plt.xlabel(f'{class_names[int(labels[i].numpy())]}')
        plt.ylabel(f'{class_names[int(preds[i].numpy())]}', color=col)
    plt.tight_layout()
    plt.show()

def show_preds(model):
    model.eval()
    images, labels = next(iter(test_loader))
    images = images[0].to("cuda")
    labels = labels.to("cuda")
    outputs = model(images)
    _, preds = torch.max(outputs, dim=1)
    show_images(images.cpu(), labels.cpu(), preds.cpu())

In [6]:
def write_val_to_csv(val, name):
    
    with open("{}.csv".format(name), "a", encoding="utf-8") as fh:
        fh.write("{}\n".format(val))
    fh.close()


In [7]:
class Model(nn.Module):
    def __init__(self, dropout=0.7):
        super().__init__()

        self.dropout = dropout

        self.conv2d_1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2d_2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2d_3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv2d_4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv2d_5 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv2d_6 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv2d_7 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv2d_8 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.maxPool2d = nn.MaxPool2d(kernel_size=2, stride=2, dilation=1)
        self.linear_1 = nn.Linear(512 * 7 * 7, 4096)
        self.linear_2 = nn.Linear(4096, 4096)
        self.linear_3 = nn.Linear(4096, 1000)
        self.linear_4 = nn.Linear(1000, 3)
        # self.linear_5 = nn.Linear(4096, 3)      # output layer
        self.dropout = nn.Dropout(self.dropout)
        self.adaptiveAvgPool = nn.AdaptiveAvgPool2d(7)

    def forward(self, x:torch.Tensor):
        # 1st Conv
        x = F.relu(self.conv2d_1(x))
        # x = F.relu(self.conv2d_2(x))
        x = self.maxPool2d(x)

        # 2nd Conv
        x = F.relu(self.conv2d_3(x))
        # x = F.relu(self.conv2d_4(x))
        x = self.maxPool2d(x)

        # 3rd Conv
        x = F.relu(self.conv2d_5(x))
        x = F.relu(self.conv2d_6(x))
        # x = F.relu(self.conv2d_6(x))
        x = self.maxPool2d(x)

        # 4th Conv
        x = F.relu(self.conv2d_7(x))
        # x = F.relu(self.conv2d_8(x))
        # x = F.relu(self.conv2d_8(x))
        x = self.maxPool2d(x)

        # 5th Conv
        x = F.relu(self.conv2d_8(x))
        # x = F.relu(self.conv2d_8(x))
        # x = F.relu(self.conv2d_8(x))
        x = self.maxPool2d(x)

        x = self.adaptiveAvgPool(x)

        x = x.view(x.size(0), -1)

        # Classifier
        x = F.relu(self.linear_1(x))
        x = self.dropout(x)
        x = F.relu(self.linear_2(x))
        x = self.dropout(x)
        x = F.relu(self.linear_3(x))
        x = self.dropout(x)
        x = F.relu(self.linear_4(x))
        # x = self.dropout(x)
        # x = self.linear_5(x)
        return x

In [8]:
model = Model()

# Check if trained model existed, if yes load it
if os.path.exists("./small_project_model_2.pth"):
    model.load_state_dict(torch.load("./small_project_model_2.pth"))


model.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5, weight_decay=1e-5)

def save_model(model):
    save_file = 'small_project_model_2.pth'
    torch.save(model.state_dict(), save_file)

def train(epochs):

    n_epochs = epochs

    start = time.time()

    best_accuracy = 0

    for epoch in range(1, n_epochs + 1):

        train_loss = 0
        steps = 0

        # Training
        model.train()
        for datas, target in train_loader:
            for data in datas:
              data, tmp_target = data.cuda(), target.cuda()
              optimizer.zero_grad()
              output = model.forward(data)
              loss = criterion(output, tmp_target)
              loss.backward()
              optimizer.step()

              train_loss += loss.item()

              if steps % 100 == 0:
                accuracy = 0

                # Evaluation
                valid_loss = 0
                model.eval()
                for val_step, (data, val_target) in enumerate(test_loader):
                
                    data, tmp_target = data[0].cuda(), val_target.cuda()
                    val_output = model.forward(data)
                    loss = criterion(val_output, tmp_target)
                    valid_loss += loss.item()

                    _, preds = torch.max(val_output, 1)
                    accuracy += sum((preds == tmp_target).cpu().numpy())

                valid_loss /= (val_step + 1)
                accuracy = accuracy / len(test_loader.dataset)
                train_loss /= (steps + 1)
                
                print("Epoch: {:3}/{:3} Steps: {:4}/{:4} Train Loss: {:.6f} Validation Loss: {:.6f} Accuracy: {:.4f}".format(epoch, n_epochs, steps, len(train_loader)*10, train_loss, valid_loss, accuracy))

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    save_model(model)

                # show predictions plots
                # show_preds(model)

                write_val_to_csv(valid_loss, "valid_loss")
                write_val_to_csv(accuracy, "accuracy")
                write_val_to_csv(train_loss, "train_loss")
                
                model.train()

                if accuracy >= 0.98:
                    print('Performance condition satisfied, stopping..')
                    save_model(model)
                    print("Run time: {:.3f} min".format((time.time() - start)/60))
                    return train_loss_list, validation_loss_list, accuracy_list

              steps += 1          
            
    save_model(model)
    print("Run time: {:.3f} min".format((time.time() - start)/60))
    return accuracy_list

In [9]:
train_loss_list, validation_loss_list, accuracy_list = train(12)

Epoch:   1/ 12 Steps:    0/1630 Train Loss: 0.430239 Validation Loss: 0.578710 Accuracy: 0.8228
Epoch:   1/ 12 Steps:  100/1630 Train Loss: 0.703046 Validation Loss: 1.450348 Accuracy: 0.5593
Epoch:   1/ 12 Steps:  200/1630 Train Loss: 0.381256 Validation Loss: 0.803663 Accuracy: 0.7626
Epoch:   1/ 12 Steps:  300/1630 Train Loss: 0.245084 Validation Loss: 0.686093 Accuracy: 0.7626
Epoch:   1/ 12 Steps:  400/1630 Train Loss: 0.188712 Validation Loss: 0.572547 Accuracy: 0.8163
Epoch:   1/ 12 Steps:  500/1630 Train Loss: 0.150108 Validation Loss: 0.761366 Accuracy: 0.7642
Epoch:   1/ 12 Steps:  600/1630 Train Loss: 0.131382 Validation Loss: 0.645203 Accuracy: 0.8016
Epoch:   1/ 12 Steps:  700/1630 Train Loss: 0.101853 Validation Loss: 0.721362 Accuracy: 0.7967
Epoch:   1/ 12 Steps:  800/1630 Train Loss: 0.091960 Validation Loss: 0.719461 Accuracy: 0.7691
Epoch:   1/ 12 Steps:  900/1630 Train Loss: 0.080508 Validation Loss: 0.848460 Accuracy: 0.7642
Epoch:   1/ 12 Steps: 1000/1630 Train Lo

RuntimeError: CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 8.00 GiB total capacity; 5.90 GiB already allocated; 18.79 MiB free; 6.49 GiB reserved in total by PyTorch)

# Test the image Augmentation

In [None]:
images, labels = train_loader.dataset[0]
labels = [labels for _ in range(len(images))]

plt.figure(figsize=(16, 8))
for i, image in enumerate(images):
    plt.subplot(1, 10, i + 1, xticks=[], yticks=[])
    plt.imshow(np.squeeze(image))
    col = 'green'
    plt.rc('axes', labelsize=14)
    plt.xlabel(f'{class_names[int(labels[i])]}')
plt.tight_layout()
plt.show()

# Read CSV for data recorded

In [None]:
def retrieve_data(csv_path):
    val_list = []
    with open(csv_path, "r", encoding="utf-8") as fh:
        lines = fh.readlines()
        for line in lines:
            line = line.strip("\n")
            if len(line) > 0:
                val_list.append(float(line))
    return val_list

# Plot Graph for Accuracy

In [None]:
# Plotting accuracy
import matplotlib.pyplot as plt

accuracy_list = retrieve_data("accuracy.csv")
steps = len(accuracy_list)
plt.plot(np.arange(1, steps+1,1),accuracy_list[:steps], label='accuracy')
plt.xticks(range(1,steps+1,2))
plt.xlim(1,steps+1)
plt.xlabel('steps (20 batches)')
plt.ylabel('accuracy')
plt.title('Accuracy')
plt.legend(loc='upper right')
filename = 'accuracy.png'
plt.savefig(filename)
plt.show()

# Plot Graph for Train, Validation Loss

In [None]:
import matplotlib.pyplot as plt

train_loss_list = retrieve_data("train_loss.csv")
validation_loss_list = retrieve_data("valid_loss.csv")
steps = len(validation_loss_list)
plt.plot(np.arange(1, steps+1,1),train_loss_list[:steps], label='average train loss')
plt.plot(np.arange(1,steps+1,1), validation_loss_list[:steps], label='average validation loss')
plt.xticks(range(1,steps+1,2))
plt.xlim(1,steps+1)
plt.xlabel('steps (20 batches)')
plt.ylabel('running losses')
plt.title('loss reduction')
plt.legend(loc='upper right')
filename = 'loss_profile.png'
plt.savefig(filename)
plt.show()