# Setup

In [None]:
import setup
import os

# Dataset Downloading

Covid-19 Image Classification

In [None]:
#!curl -L -o /content/sample_data/covid19-image-dataset.zip\
#  https://www.kaggle.com/api/v1/datasets/download/pranavraikokte/covid19-image-dataset

In [None]:
#from zipfile import ZipFile
#with ZipFile('/content/sample_data/covid19-image-dataset.zip') as zp:
#  zp.extractall('/content/sample_data/')

# Data Loading

In [None]:
# Checking Biggest Image Sizes
from glob import glob
from setup import DATA_PATH
import os
from PIL import Image
# Gather every training image: one sub-folder per class, three accepted extensions.
# (No `recursive=True`: it only affects patterns containing `**`.)
imgs = [
    path
    for extension in ('jpg', 'png', 'jpeg')
    for path in glob(os.path.join(DATA_PATH, 'train', '*', f'*.{extension}'))
]

widths = []
heights = []

for img in imgs:
    # Image.open keeps the file handle open until the image is closed;
    # the context manager releases it as soon as the size has been read.
    with Image.open(img) as pil_image:
        widths.append(pil_image.width)
        heights.append(pil_image.height)

if imgs:
    print(f'Largest: width {max(widths)} and height {max(heights)}')
    print(f'Mean values: width {sum(widths)/len(widths)} and height {sum(heights)/len(heights)}')
else:
    # max() and the mean would both fail on an empty list; report the likely cause instead.
    print(f'No training images found under {os.path.join(DATA_PATH, "train")}')

In [None]:
from torchvision import datasets, transforms
from setup import IMAGE_WIDTH, IMAGE_HEIGHT
from data.loading import load_data
from math import ceil

# Preprocessing applied to every image:
#   1. Resize to one fixed size so samples can be stacked into a batch.
#      torchvision's Resize expects the size as (height, width), so height goes first.
#   2. ToTensor converts the PIL image to a float tensor scaled to [0, 1].
#   3. Normalize with mean 0.5 / std 0.5 per channel maps [0, 1] to [-1, 1].
transform = transforms.Compose([
    transforms.Resize(size=(IMAGE_HEIGHT, IMAGE_WIDTH)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# NOTE(review): load_data is a project helper; it is unpacked here as a
# (train, test) pair of datasets that apply `transform` — confirm in data.loading.
train_ds, test_ds = load_data(transform)


We might not have to create a custom dataset, because we can use ImageFolder to create a dataset from an image folder with an organized structure. It will automatically associate the class names to images according to folder names.

https://debuggercafe.com/pytorch-imagefolder-for-training-cnn-models/

Images loaded are of different sizes, which would cause a
```
RuntimeError: stack expects each tensor to be equal size, but got [3, 3480, 4248] at entry 0 and [3, 1303, 1458] at entry 1

```

We can fix that by ensuring all images have the same size, using a resize transform.

https://discuss.pytorch.org/t/runtimeerror-stack-expects-each-tensor-to-be-equal-size-but-got-3-224-224-at-entry-0-and-3-224-336-at-entry-3/87211

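Normalizing each channel puts all inputs on a common, zero-centered scale, so that brighter (larger-valued) pixels do not dominate purely because of their magnitude. With a mean of 0.5 and a standard deviation of 0.5 per channel, pixel values in [0, 1] are mapped to [-1, 1]. Use a transform for that.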

https://stats.stackexchange.com/questions/211436/why-normalize-images-by-subtracting-datasets-image-mean-instead-of-the-current

# Analyzing Data

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# One row per training sample: image path and integer class index.
df = pd.DataFrame(train_ds.imgs, columns=['Img_Uri', 'Class'])
classes = df.loc[:, 'Class']

# Derive tick positions from the dataset instead of hard-coding three classes,
# and centre exactly one histogram bin on each integer class index.
class_indices = list(range(len(train_ds.classes)))
bin_edges = [index - 0.5 for index in range(len(train_ds.classes) + 1)]

plt.hist(classes, bins=bin_edges, rwidth=0.8)
plt.xticks(class_indices, labels=train_ds.classes)
plt.xlabel('Class')
plt.ylabel('Number of images')
plt.title('Training images per class')
plt.show()

# Per-class sample counts as a table (displayed as the cell output).
df.groupby(['Class']).count()

# Modifying Data

-> Undersampling

In [None]:
# Number of training samples before the undersampling cell below runs.
len(train_ds)

In [None]:
# Undersample class index 0: keep at most MAX_CLASS0_SAMPLES of its images and
# every image of the other classes.
MAX_CLASS0_SAMPLES = 70

kept_class0 = 0
new_imgs = []
targets = []
samples = []

# Walk the three parallel lists in lockstep so that every kept image carries its
# own target and sample entry. Indexing them with a separate counter that only
# advances on kept items would pair later images with the wrong entries.
for img, target, sample in zip(train_ds.imgs, train_ds.targets, train_ds.samples):
    if img[1] == 0:
        if kept_class0 >= MAX_CLASS0_SAMPLES:
            continue
        kept_class0 += 1

    new_imgs.append(img)
    targets.append(target)
    samples.append(sample)

# Replace the dataset's bookkeeping lists with the filtered versions.
train_ds.imgs = new_imgs
train_ds.targets = targets
train_ds.samples = samples

In [None]:
# Size of the training set as currently stored on the dataset object.
NUM_IMGS = len(train_ds.imgs)
# Ceiling division: a final, partially filled batch still counts as a batch.
NUM_BATCHES = (NUM_IMGS + setup.BATCH_SIZE - 1) // setup.BATCH_SIZE

In [None]:
# Number of training samples after the undersampling cell above has run.
len(train_ds)

## Loader

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
from setup import BATCH_SIZE


# Shuffled mini-batches over the training set, loaded by two worker processes.
loader = DataLoader(train_ds, batch_size=BATCH_SIZE, num_workers=2, shuffle=True)

# Explicit iterator over the loader, for pulling batches manually with next().
iterable_ds = iter(loader)

# Declaring Components

- Model
- Optimizer
- Loss Function
- Scheduler

## Making Model

In [None]:
from torch import nn
from setup import FINE_TUNE
from model.load_model import create_model

# Build the network through the project helper, forwarding the FINE_TUNE setting.
# NOTE(review): what fine_tune toggles is defined in model.load_model, not visible here.
model = create_model(fine_tune=FINE_TUNE)

## Setting Up Optimizer, Loss Function and Scheduler

In [None]:
from torch.optim import lr_scheduler

# Loss function: cross-entropy over the class scores.
loss = nn.CrossEntropyLoss()

# Optimizer: SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=setup.LEARNING_RATE,
    momentum=0.9,
)

# Scheduler: multiply the learning rate by 0.5 after every 10 scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.5)

# Training

Apparently, to run on the GPU, we have to send the data itself to the GPU. We can do so with the batches. The loss function also needs the labels to be loaded on the GPU.

https://discuss.pytorch.org/t/how-to-load-all-data-into-gpu-for-training/27609/22?page=2

Also, we need the weights to be on the GPU. We can do that by using the model itself, sending it to the GPU as well.

https://discuss.pytorch.org/t/how-to-load-all-data-into-gpu-for-training/27609/34?page=2

In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters onto the chosen device.
model = model.to(device)

tqdm is a small library for displaying a progress bar while iterating over a loop.

https://github.com/tqdm/tqdm

In [None]:
from model.train import train

# Run the project's training routine with the components configured above.
train(
    # what to train and on which data
    model=model,
    dataset=train_ds,
    # optimisation components
    loss=loss,
    optimizer=optimizer,
    scheduler=scheduler,
    # execution settings
    device=device,
    num_epochs=setup.NUM_EPOCHS,
    num_batches=NUM_BATCHES,
    num_imgs=NUM_IMGS,
)

# Evaluation

Evaluating with Test Data

In [None]:
from data.evaluation import evaluate_model

#evaluate_model(model=model,
#               test_data=test_ds,
#               batch_size=BATCH_SIZE
#               )

In [None]:
from data.evaluation import evaluate_single

# Run the project's single-sample evaluation helper on the trained model.
# NOTE(review): classes_list is hard-coded here; presumably it must follow the same
# index order as train_ds.classes — confirm against data.evaluation.
evaluate_single(model=model, classes_list=['Covid', 'Normal', 'Viral Pneumonia'], device=device)

# Saving Model

In [None]:
from model.save_model import save_model

# Hand the model and the optimizer to the project's save helper.
# NOTE(review): destination path and file format are defined in model.save_model.
save_model(model=model, optimizer=optimizer)