# Preprocessing and Model Training

In [None]:
from pneumonia_detector.preprocess import XrayDataset
from pneumonia_detector.model import PneumoniaClassifier
from pneumonia_detector.training_run import train_model
import os
import torch
import torch.nn as nn 
import numpy as np
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import transforms

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Seed torch's global random number generator so repeated runs draw the same random numbers.
# NOTE(review): only torch is seeded here; NumPy's and Python's own generators are
# left untouched — confirm nothing downstream depends on them.
torch.manual_seed(55)

In [None]:
# Root folders of the training and validation images; both are later passed as
# `root_dir` to XrayDataset and as arguments to `train_model`.
# NOTE(review): absolute paths — presumably tied to a dev-container workspace;
# adjust them when running elsewhere.
training_dir = "/workspaces/chest_xray_challenge/data/chest_xray/train/"
validation_dir = "/workspaces/chest_xray_challenge/data/chest_xray/val/"

In [None]:
# Augmentation and preprocessing pipeline for the training images.
# The steps run in the order listed, so the order must not be changed casually.
train_transforms = transforms.Compose(
    [
        # --- random augmentation ---
        transforms.RandomRotation(degrees=20),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomResizedCrop(size=(256, 256), scale=(0.8, 1.0)),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
        transforms.RandomApply(
            [transforms.RandomAffine(degrees=0, translate=(0.1, 0.1))],
            p=0.5,
        ),
        # NOTE(review): RandomPerspective has its own default p=0.5, so wrapped in
        # RandomApply(p=0.5) the effective chance is presumably 0.25 — confirm intended.
        transforms.RandomApply(
            [transforms.RandomPerspective(distortion_scale=0.2)],
            p=0.5,
        ),
        # --- deterministic preprocessing ---
        # The crop above already yields 256x256, so this resize acts as a safeguard.
        transforms.Resize(size=(256, 256)),
        transforms.ToTensor(),
        # Same mean / std for each of the three channels.
        transforms.Normalize(mean=[0.4823] * 3, std=[0.2363] * 3),
    ]
)

In [None]:
# Create training set Dataset object; `train_transforms` is handed over as its `transform`.
xray_train_data = XrayDataset(root_dir=training_dir, transform=train_transforms)
# Show the number of training samples as the cell output.
len(xray_train_data)

In [None]:
# Function to create weighted sampler based on class imbalance.
def create_weighted_sampler(dataset):
    """Build a sampler that compensates for class imbalance in ``dataset``.

    Each file in ``dataset.files`` is labelled from the (lower-cased) name of
    its parent directory via ``XrayDataset.label_map``. Every sample is then
    weighted by the inverse of its class count, so each class carries the same
    total weight.

    Args:
        dataset: Object exposing ``files``, an iterable of file path strings
            whose second-to-last path component is the class name.

    Returns:
        A ``WeightedRandomSampler`` that draws as many samples per pass as
        there are files in ``dataset``.
    """
    # The class name is the parent directory of each file path.
    labels = []
    for path in dataset.files:
        class_name = path.split(os.sep)[-2].lower()
        labels.append(XrayDataset.label_map[class_name])

    # Inverse class frequency: rarer classes get proportionally larger weights.
    inverse_frequency = 1.0 / np.bincount(labels)
    sample_weights = [inverse_frequency[label] for label in labels]

    return WeightedRandomSampler(sample_weights, len(sample_weights))

In [None]:
# Create the weighted sampler for the training set; it is passed to the training DataLoader below.
sampler = create_weighted_sampler(xray_train_data)

In [None]:
# Sanity-check the preprocessing on the first training sample.
# The sample is fetched exactly once: every `xray_train_data[0]` lookup loads the
# image again and presumably re-applies the random training transforms, so
# separate lookups would report statistics of different augmented images.
sample_image = xray_train_data[0][0]
# Reduce over dims 1 and 2 (height/width of the C x H x W tensor from ToTensor).
print(torch.mean(sample_image, dim=[1, 2], keepdim=True))
print(torch.std(sample_image, dim=[1, 2], keepdim=True))
# Overall value range after normalisation.
print(torch.min(sample_image))
print(torch.max(sample_image))

In [None]:
# Create a corresponding Dataset object for the validation set to allow validation during training.
# Validation must not be randomly augmented, otherwise the validation metrics become
# noisy and are not comparable between epochs. Only the deterministic steps of the
# training pipeline (resize, tensor conversion, normalisation) are applied.
val_transforms = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.4823, 0.4823, 0.4823], std=[0.2363, 0.2363, 0.2363]),
    ]
)
xray_val_data = XrayDataset(root_dir=validation_dir, transform=val_transforms)
# Show the number of validation samples as the cell output.
len(xray_val_data)

Create training and validation DataLoader objects

In [None]:
# Training loader: batches of 16 drawn through the class-balancing `sampler`.
# No `shuffle` flag is given because the sampler decides the ordering;
# `num_workers=0` keeps data loading in the main process.
train_dataloader_xray = DataLoader(
    xray_train_data,
    sampler=sampler,
    batch_size=16,
    num_workers=0,
)

In [None]:
# Validation loader: samples are served in a fixed order so that validation passes
# are reproducible; shuffling brings no benefit when the data is only evaluated.
val_dataloader_xray = DataLoader(
    dataset=xray_val_data,
    batch_size=16,
    num_workers=0,
    shuffle=False,
)

In [None]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Show the chosen device as the cell output.
device

In [None]:
# Instantiate the model, loss function and optimizer
# NOTE(review): none of these three objects is passed to `train_model` in the final
# cell, and `model` is reassigned from that call's return value — presumably
# `train_model` builds its own; confirm whether this cell is still needed.
model = PneumoniaClassifier().to(device)
criterion = nn.CrossEntropyLoss()
# Alternative optimizer kept for reference (`optim` is not imported in this notebook;
# it would have to be `torch.optim`):
# optimizer = optim.SGD(model.parameters(), lr=0.0005, momentum=0.9)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Settings for the training run.
# Output location and file name (presumably where the trained model is saved).
model_dir = "/workspaces/chest_xray_challenge/models/"
model_filename = "nb_test.pt"

# Hyper-parameters handed to `train_model`.
image_size = 256
batch_size = 16
learning_rate = 0.001
n_epochs = 2
patience = 5

# Run the training. All arguments are positional, so their order here must
# match the parameter order of `train_model`.
model, train_loss, valid_loss = train_model(
    model_dir,
    model_filename,
    training_dir,
    validation_dir,
    batch_size,
    patience,
    n_epochs,
    image_size,
    learning_rate,
)