
## Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split
import torchvision.models as models
from sklearn.model_selection import train_test_split
import wandb
import coloredlogs
import logging

  from .autonotebook import tqdm as notebook_tqdm


## Identify running device,chose GPU if available

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Data Preprocessing

In [3]:
# Define data transformations including augmentation
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

### Train/Test/Val Split

In [4]:
# Load entire dataset using ImageFolder
full_dataset = ImageFolder(root='Histology_Dataset/Train')

# Split dataset into train, validation, and test sets
train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size

train_dataset, temp_dataset = random_split(full_dataset, [train_size, len(full_dataset) - train_size])
val_dataset, test_dataset = random_split(temp_dataset, [val_size, test_size])

# Apply transformations to datasets
train_dataset.dataset.transform = train_transform
val_dataset.dataset.transform = val_transform
test_dataset.dataset.transform = val_transform

# Create data loaders for train, validation, and test datasets
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


## Model Definition

In [5]:
torch.manual_seed(42)
def weights_init(m):
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform(m.weight.data)

In [6]:
class HistologyClassifier(nn.Module):
    def __init__(self, num_classes):
        super(HistologyClassifier, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

## Pre-Trained Model (Resnet)

In [7]:
# Load pre-trained ResNet model
resnet = models.resnet18(pretrained=True)
num_features = resnet.fc.in_features

# Modify the classifier of ResNet
resnet.fc = nn.Sequential(
    nn.Linear(num_features, 512),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(512, 4)  # Assuming 4 classes
)

# Add Batch Normalization
model_resnet = nn.Sequential(
    resnet,
    nn.BatchNorm1d(4)  # Applying BatchNorm to the output of the classifier
)




## Define Train/Test Functions


In [8]:
# Configure the logger
coloredlogs.install(level=logging.INFO)
logger = logging.getLogger(__name__)

In [9]:
def train_model(model, train_loader, valid_loader, device, project_name, exp_name, num_epochs=10):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    lr = 0.001
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Track metrics
    training_metrics = {}
    wandb.init(project=project_name,name=exp_name,
        config={
            "lr":lr,
            'optimizer':'adam',
            "num_epoch":num_epochs})
    # Training
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0
        size_train_loader = len(train_loader)
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Calculate Train Accuracy
            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
        train_loss = running_loss/size_train_loader
        train_acc = running_corrects.double() / size_train_loader
        # Validation
        model.eval()
        with torch.no_grad():
            valid_loss = 0.0
            running_corrects = 0
            # total = 0
            size_val_loader = len(val_loader)
            for inputs, labels in valid_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                valid_loss += loss.item() * inputs.size(0)
                _, preds = outputs.max(1)
                running_corrects += torch.sum(preds == labels.data)
            val_loss = valid_loss / size_val_loader
            val_acc = running_corrects.double() / size_val_loader
        metrics = {
                    "train_loss": train_loss,
                    "train_acc": train_acc,
                    "val_loss": val_loss,
                    "val_acc": val_acc,
                    "epoch":epoch}
        wandb.log({
                "train_loss": metrics['train_loss'],
                "train_acc": metrics['train_acc'].item(),
                "val_loss": metrics['val_loss'],
                "val_acc": metrics['val_acc'].item(),
            })
        logger.info(
                    f"Epoch [{epoch+1}/{num_epochs}] - "
                    f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
                    f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
                )
    return {
        "model": model,
        "training_metrics":training_metrics
    }



In [10]:
model_histology_custom = HistologyClassifier(num_classes=4)
model_histology_custom.apply(weights_init)
train_result = train_model(model_histology_custom, train_loader, val_loader, device, project_name="HistologyClassifier", exp_name="custom_model",num_epochs=50)

  torch.nn.init.xavier_uniform(m.weight.data)
[32m2023-09-01 12:44:38[0m [35mAntonWindows[0m [34mwandb.jupyter[1464][0m [1;30mERROR[0m [31mFailed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.[0m
[34m[1mwandb[0m: Currently logged in as: [33mfurkanayik[0m. Use [1m`wandb login --relogin`[0m to force relogin


[32m2023-09-01 12:45:36[0m [35mAntonWindows[0m [34m__main__[1464][0m [1;30mINFO[0m Epoch [1/50] - Train Loss: 157.0298, Train Acc: 8.2500, Val Loss: 44.6952, Val Acc: 6.0000
[32m2023-09-01 12:45:45[0m [35mAntonWindows[0m [34m__main__[1464][0m [1;30mINFO[0m Epoch [2/50] - Train Loss: 44.5655, Train Acc: 7.1250, Val Loss: 44.2925, Val Acc: 8.0000
[32m2023-09-01 12:45:53[0m [35mAntonWindows[0m [34m__main__[1464][0m [1;30mINFO[0m Epoch [3/50] - Train Loss: 44.4521, Train Acc: 7.2500, Val Loss: 44.3904, Val Acc: 3.0000
[32m2023-09-01 12:46:02[0m [35mAntonWindows[0m [34m__main__[1464][0m [1;30mINFO[0m Epoch [4/50] - Train Loss: 44.4205, Train Acc: 6.5000, Val Loss: 44.3618, Val Acc: 6.0000
[32m2023-09-01 12:46:10[0m [35mAntonWindows[0m [34m__main__[1464][0m [1;30mINFO[0m Epoch [5/50] - Train Loss: 44.6604, Train Acc: 9.0000, Val Loss: 44.4825, Val Acc: 7.0000
[32m2023-09-01 12:46:19[0m [35mAntonWindows[0m [34m__main__[1464][0m [1;30mINFO[0m Epo

{'model': HistologyClassifier(
   (features): Sequential(
     (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (1): ReLU()
     (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (4): ReLU()
     (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (7): ReLU()
     (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   )
   (classifier): Sequential(
     (0): Dropout(p=0.5, inplace=False)
     (1): Linear(in_features=200704, out_features=512, bias=True)
     (2): ReLU()
     (3): Dropout(p=0.5, inplace=False)
     (4): Linear(in_features=512, out_features=4, bias=True)
   )
 ),
 'training_metrics': {}}

In [16]:
def test_model(model):
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()

            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
        avg_test_loss = test_loss/len(test_loader)
        acc_test = correct/total
        print(f"Test Loss: {avg_test_loss:.4f}, "
            f"Test Accuracy: {100*acc_test:.2f}%")
        return avg_test_loss, acc_test


In [12]:
train_model(model_resnet, train_loader, val_loader, device, 
    project_name="Histology_Classification", 
    exp_name="resnet_train",
    num_epochs=50)

[32m2023-09-01 04:24:47[0m [35mAntonWindows[0m [34m__main__[26428][0m [1;30mINFO[0m Epoch [1/50] - Train Loss: 42.5706, Train Acc: 12.8750, Val Loss: 69.1040, Val Acc: 1000.0000
[32m2023-09-01 04:24:57[0m [35mAntonWindows[0m [34m__main__[26428][0m [1;30mINFO[0m Epoch [2/50] - Train Loss: 35.5436, Train Acc: 17.2500, Val Loss: 82.5637, Val Acc: 900.0000
[32m2023-09-01 04:25:06[0m [35mAntonWindows[0m [34m__main__[26428][0m [1;30mINFO[0m Epoch [3/50] - Train Loss: 33.8744, Train Acc: 18.8750, Val Loss: 104.1375, Val Acc: 1000.0000
[32m2023-09-01 04:25:14[0m [35mAntonWindows[0m [34m__main__[26428][0m [1;30mINFO[0m Epoch [4/50] - Train Loss: 29.7875, Train Acc: 22.2500, Val Loss: 137.4967, Val Acc: 700.0000
[32m2023-09-01 04:25:22[0m [35mAntonWindows[0m [34m__main__[26428][0m [1;30mINFO[0m Epoch [5/50] - Train Loss: 28.8408, Train Acc: 21.3750, Val Loss: 38.0188, Val Acc: 1500.0000
[32m2023-09-01 04:25:31[0m [35mAntonWindows[0m [34m__main__[26428

{'model': Sequential(
   (0): ResNet(
     (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
     (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     (relu): ReLU(inplace=True)
     (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
     (layer1): Sequential(
       (0): BasicBlock(
         (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
         (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
         (relu): ReLU(inplace=True)
         (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
         (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       )
       (1): BasicBlock(
         (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
         (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1,

## Train Custom Model

## What would you do to improve the performance of the classifier ?

To improve the performance of the histology image classifier, you can consider several strategies. Keep in mind that experimentation and tuning are crucial in achieving better results. Here are some approaches you can take:

1. **Data Augmentation:** Apply various data augmentation techniques to increase the diversity of your training dataset. Common augmentations include random rotations, flips, shifts, brightness adjustments, and zooms. This helps the model generalize better to different variations of the same image.

2. **Transfer Learning:** Utilize pre-trained models like ResNet, VGG, or Inception, which are trained on large datasets like ImageNet. Fine-tune these models on your histology dataset. Transfer learning can significantly boost performance, as the models have already learned useful features from a diverse range of images.

3. **Learning Rate Scheduling:** Adjust the learning rate during training. Start with a larger learning rate and then gradually reduce it as training progresses. This can help the model converge faster and prevent overshooting.

4. **Model Architecture:** Experiment with different CNN architectures, layer depths, kernel sizes, and the number of filters. More complex architectures might capture finer features but could also lead to overfitting.

5. **Regularization:** Implement regularization techniques like dropout and weight decay to prevent overfitting. These techniques help the model generalize better to unseen data.

6. **Batch Normalization:** Add batch normalization layers to normalize the activations of each layer, which can speed up training and improve convergence.

7. **Optimizer Choice:** Besides Adam, experiment with other optimizers like SGD with momentum or RMSprop. Different optimizers can have varying effects on convergence speed and generalization.

8. **Hyperparameter Tuning:** Systematically search for optimal hyperparameters, such as learning rate, dropout rate, batch size, and others. Tools like random search or grid search can be used for this purpose.

9. **Ensemble Methods:** Train multiple models with different initializations or architectures and combine their predictions to make final decisions. Ensemble methods can help reduce model variance and improve overall performance.

10. **Class Imbalance Handling:** If your classes are imbalanced, apply techniques like class weighting or oversampling the minority classes to prevent the model from being biased towards the majority class.

11. **Regular Monitoring:** Continuously monitor training and validation curves. If the validation loss starts increasing while training loss is decreasing, it might indicate overfitting. You might need to stop training or adjust regularization.

12. **Model Interpretability:** Utilize techniques like Grad-CAM to understand which regions of the image are influencing the model's decisions. This can help identify whether the model is focusing on relevant regions.

Remember, there's no one-size-fits-all solution, and the effectiveness of these strategies can vary based on your specific dataset and problem. It's recommended to experiment with a combination of these techniques to find the best approach for your histology image classifier.