In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


# Data Loading

In [2]:
data_dir = '/kaggle/input/breast-histopathology-images'
image_size = 224  
batch_size = 32


 Load All Images

In [3]:
import os

def load_all_images(base_path, limit=None):
    image_paths = []
    labels = []

    for patient_id in os.listdir(base_path):
        patient_path = os.path.join(base_path, patient_id)

        
        if not os.path.isdir(patient_path):
            continue
        
        for label_folder in ['0', '1']:  # 0 = Benign, 1 = Malignant
            label_path = os.path.join(patient_path, label_folder)
            if not os.path.exists(label_path):
                continue

            
            for file in os.listdir(label_path):
                if file.endswith('.png'):
                    image_paths.append(os.path.join(label_path, file))
                    labels.append(int(label_folder))
    
    if limit:
        image_paths = image_paths[:limit]
        labels = labels[:limit]

    return image_paths, labels


base_path = '/kaggle/input/breast-histopathology-images'
image_paths, labels = load_all_images(base_path, limit=20000)

print("Total images loaded:", len(image_paths))
print("Sample image path:", image_paths[0])
print("Label (0=Benign, 1=Malignant):", labels[0])


Total images loaded: 20000
Sample image path: /kaggle/input/breast-histopathology-images/10295/0/10295_idx5_x1351_y1101_class0.png
Label (0=Benign, 1=Malignant): 0


# Image Pre-processing and and Training

In [4]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms

# Resize images to 64x64
image_size = 64
batch_size = 32

transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

class HistopathologyDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        if self.transform:
            image = self.transform(image)
        label = self.labels[idx]
        return image, label

# Split into train and validation
from sklearn.model_selection import train_test_split

train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, stratify=labels, random_state=42)

train_dataset = HistopathologyDataset(train_paths, train_labels, transform=transform)
val_dataset = HistopathologyDataset(val_paths, val_labels, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


# Modeling

In [5]:
import torch.nn as nn
import torch

class EnhancedCNN(nn.Module):
    def __init__(self):
        super(EnhancedCNN, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(128 * (image_size // 8) * (image_size // 8), 128), nn.ReLU(),
            nn.Dropout(0.5),  # Dropout layer with 50% probability
            nn.Linear(128, 2)  # 2 classes (Benign, Malignant)
        )

    def forward(self, x):
        return self.net(x)

# Assuming image_size is defined earlier, otherwise set it
image_size = 64  # Adjust as per your input image size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize and move model to device
model = EnhancedCNN().to(device)


# Datasets Transformation

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader


train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, stratify=labels, random_state=42)


train_dataset = HistopathologyDataset(train_paths, train_labels, transform=transform)
val_dataset = HistopathologyDataset(val_paths, val_labels, transform=transform)

# Create DataLoader for train and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


Training 

In [8]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# Initialize optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Learning rate scheduler
scheduler = StepLR(optimizer, step_size=5, gamma=0.7)

# Training function
def train(model, loader):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Accuracy calculation
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    
    accuracy = correct / total
    return total_loss / len(loader), accuracy

# Validation function
def validate(model, loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    return accuracy


# Epochs

In [9]:
num_epochs = 20  # You can increase this for better training

for epoch in range(num_epochs):
    # Train the model
    train_loss, train_accuracy = train(model, train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    
    # Validate the model
    val_accuracy = validate(model, val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}")
    
    # Update the learning rate
    scheduler.step()


Epoch 1/20, Train Loss: 0.3816, Train Accuracy: 0.8336
Epoch 1/20, Validation Accuracy: 0.8615
Epoch 2/20, Train Loss: 0.3280, Train Accuracy: 0.8619
Epoch 2/20, Validation Accuracy: 0.8327
Epoch 3/20, Train Loss: 0.3047, Train Accuracy: 0.8729
Epoch 3/20, Validation Accuracy: 0.8842
Epoch 4/20, Train Loss: 0.2890, Train Accuracy: 0.8788
Epoch 4/20, Validation Accuracy: 0.8898
Epoch 5/20, Train Loss: 0.2728, Train Accuracy: 0.8874
Epoch 5/20, Validation Accuracy: 0.8802
Epoch 6/20, Train Loss: 0.2443, Train Accuracy: 0.8972
Epoch 6/20, Validation Accuracy: 0.8922
Epoch 7/20, Train Loss: 0.2294, Train Accuracy: 0.9036
Epoch 7/20, Validation Accuracy: 0.8875
Epoch 8/20, Train Loss: 0.2160, Train Accuracy: 0.9102
Epoch 8/20, Validation Accuracy: 0.8855
Epoch 9/20, Train Loss: 0.1984, Train Accuracy: 0.9175
Epoch 9/20, Validation Accuracy: 0.8928
Epoch 10/20, Train Loss: 0.1749, Train Accuracy: 0.9274
Epoch 10/20, Validation Accuracy: 0.8900
Epoch 11/20, Train Loss: 0.1481, Train Accuracy:

Prevent Overfitting

In [10]:
# Evaluate the model on the validation set after training
final_accuracy = validate(model, val_loader)
print(f"Final Validation Accuracy: {final_accuracy:.4f}")


Final Validation Accuracy: 0.8905


In [11]:
torch.save(model.state_dict(), 'breast_cancer_model.pth')
