# In [None]:
## Setup Kaggle & Download Dataset

from google.colab import files
import os, shutil, glob

# Ask the user to upload the Kaggle API token through the Colab file picker,
# then move it to /root/.kaggle/kaggle.json.
files.upload()  # Upload kaggle.json
os.makedirs("/root/.kaggle", exist_ok=True)
shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")
# Permission bits must be written in octal: 0o600 is owner read/write only.
# The decimal literal 600 equals 0o1130, which sets unrelated bits and does
# not even grant the owner read access.
os.chmod("/root/.kaggle/kaggle.json", 0o600)

import kagglehub
path = kagglehub.dataset_download("ambarish/breakhis")

# Copy dataset to content folder (skipped when the destination already exists,
# so re-running the cell does not fail inside copytree).
dataset_images_path = os.path.join(path, "BreaKHis_v1", "BreaKHis_v1", "histology_slides", "breast")
destination = "/content/breakhis_dataset"
if not os.path.exists(destination):
    shutil.copytree(dataset_images_path, destination)

# Dataset overview: PNG paths per class, and PNG counts per magnification
# folder name anywhere below the destination directory.
benign_images = glob.glob(os.path.join(destination, "benign", "**", "*.png"), recursive=True)
malignant_images = glob.glob(os.path.join(destination, "malignant", "**", "*.png"), recursive=True)
magnifications = ["40X", "100X", "200X", "400X"]
image_counts = {mag: len(glob.glob(os.path.join(destination, "**", mag, "*.png"), recursive=True)) for mag in magnifications}


## Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Bar chart: number of images in each class.
plt.figure(figsize=(6, 4))
sns.barplot(x=["Benign", "Malignant"], y=[len(benign_images), len(malignant_images)])
plt.title("Class Distribution")
plt.ylabel("Number of Images")
plt.show()

# Bar chart: number of images at each magnification.
plt.figure(figsize=(6, 4))
sns.barplot(x=list(image_counts.keys()), y=list(image_counts.values()))
plt.title("Image Count by Magnification")
plt.ylabel("Number of Images")
plt.xlabel("Magnification")
plt.show()

# 2 x 4 grid: one sample image per (class, magnification).
# Row 0 is benign, row 1 is malignant; columns follow `magnifications`.
fig, axes = plt.subplots(2, 4, figsize=(14, 6))
for col, mag in enumerate(magnifications):
    for row, (folder, title) in enumerate((("benign", "Benign"), ("malignant", "Malignant"))):
        ax = axes[row, col]
        ax.set_title(f"{title} - {mag}")
        ax.axis("off")
        pattern = os.path.join(destination, folder, "**", mag, "*.png")
        # glob order is arbitrary; taking the lexicographically smallest path
        # makes the figure reproducible, and default=None avoids an IndexError
        # when a class/magnification combination has no images.
        sample = min(glob.glob(pattern, recursive=True), default=None)
        if sample is None:
            continue  # leave the panel blank (title only)
        # Close the file handle once the pixels have been handed to matplotlib.
        with Image.open(sample) as img:
            ax.imshow(img)
plt.suptitle("Sample Images by Class and Magnification", fontsize=16)
plt.tight_layout()
plt.show()

## Data Preprocessing

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Side length, in pixels, that every image is resized to (square).
IMG_SIZE = 224

# Preprocessing steps, applied in list order: resize, random horizontal flip,
# random rotation of up to 10 degrees, tensor conversion, and per-channel
# normalisation computed as (x - 0.5) / 0.5.
_pipeline_steps = [
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
]
transform = transforms.Compose(_pipeline_steps)

def _collect_image_records(root, magnification_names):
    """Return ``[image_path, class_name, magnification]`` rows found under *root*.

    The directory layout walked is
    ``<root>/<class>/SOB/<subtype>/<sample>/<magnification>/*.png`` for the
    classes ``benign`` and ``malignant``.

    Directory listings and glob results are sorted so the row order is the
    same on every run; the later seeded train/test split depends on row order.
    """
    records = []
    for class_name in ["benign", "malignant"]:
        class_path = os.path.join(root, class_name, "SOB")
        for subtype in sorted(os.listdir(class_path)):
            subtype_path = os.path.join(class_path, subtype)
            # Skip stray files at this level; os.listdir() on them would raise.
            if not os.path.isdir(subtype_path):
                continue
            for sample in sorted(os.listdir(subtype_path)):
                for mag in magnification_names:
                    mag_path = os.path.join(subtype_path, sample, mag)
                    # A missing magnification folder simply yields no matches.
                    for img_path in sorted(glob.glob(os.path.join(mag_path, "*.png"))):
                        records.append([img_path, class_name, mag])
    return records


# The dataset root is `destination`; the previous code referenced an undefined
# name (`dataset`) here and failed with NameError.
data = _collect_image_records(destination, magnifications)

df = pd.DataFrame(data, columns=["image_path", "label", "magnification"])
# Encode the class names as integers: benign -> 0, malignant -> 1.
df["label"] = df["label"].map({"benign": 0, "malignant": 1})

class BreakHisDataset(Dataset):
    """Dataset that serves ``(image, label)`` pairs from a table of image paths.

    Each row of the table must provide an ``image_path`` entry (file to open)
    and a ``label`` entry (returned unchanged alongside the image).
    """

    def __init__(self, dataframe, transform=None):
        """Store the backing table and the optional per-image transform."""
        self.dataframe, self.transform = dataframe, transform

    def __len__(self):
        """Return the number of rows in the backing table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the image for row *idx* as RGB and return it with its label."""
        record = self.dataframe.iloc[idx]
        location = record["image_path"]
        target = record["label"]
        picture = Image.open(location).convert("RGB")
        # The transform is optional; without one the PIL image is returned.
        return (self.transform(picture) if self.transform else picture), target

## Train-Test Split

from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the label column with a fixed seed.
# NOTE(review): the split is per image; if several images come from the same
# sample folder they can land on both sides of the split — confirm whether a
# grouped split is wanted.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Evaluation data gets the same resize and normalisation as training data but
# none of the random augmentation (flip/rotation), so validation loss and test
# accuracy are deterministic and measured on unaltered images.
eval_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

train_loader = DataLoader(BreakHisDataset(train_df, transform=transform), batch_size=32, shuffle=True)
test_loader = DataLoader(BreakHisDataset(test_df, transform=eval_transform), batch_size=32)

## Train ResNet18

import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from tqdm import tqdm

# ResNet18 with its default pretrained weights; the final fully connected
# layer is replaced by a single-logit head for binary classification.
model = models.resnet18(weights="ResNet18_Weights.DEFAULT")
model.fc = nn.Linear(model.fc.in_features, 1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# BCEWithLogitsLoss takes raw logits, so the model has no sigmoid layer.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2)

for epoch in range(15):
    model.train()
    train_loss = 0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device, dtype=torch.float32)
        optimizer.zero_grad()
        # Drop only the logit dimension: (N, 1) -> (N,). A bare squeeze() would
        # also drop the batch dimension when N == 1, producing a 0-d tensor
        # whose shape no longer matches `labels`.
        outputs = model(images).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Epoch {epoch+1}, Train Loss: {train_loss / len(train_loader):.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device, dtype=torch.float32)
            outputs = model(images).squeeze(1)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    # Report and schedule on the same quantity: the mean per-batch loss.
    avg_val_loss = val_loss / len(test_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    scheduler.step(avg_val_loss)

## Evaluate & Save Model
from sklearn.metrics import accuracy_score

# Final evaluation: threshold the sigmoid of each logit at 0.5 and compare
# against the ground-truth labels.
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        # squeeze(1) keeps the batch dimension even for a batch of one image;
        # a bare squeeze() would yield a 0-d tensor that extend() cannot iterate.
        outputs = model(images).squeeze(1)
        preds = (torch.sigmoid(outputs) > 0.5).long()
        # Collect both sides as plain ints so accuracy_score compares like with like.
        predictions.extend(preds.cpu().tolist())
        true_labels.extend(labels.long().tolist())

acc = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {acc * 100:.2f}%")

# Persist only the weights (state dict), not the whole model object.
torch.save(model.state_dict(), "breakhis_resnet18.pth")


## Hyperparameter Tuning with Optuna
import optuna

def objective(trial):
    """Optuna objective: train a fresh ResNet18 briefly and return its accuracy.

    Samples a learning rate (log-uniform in [1e-4, 1e-2]) and an optimizer
    ("adam" or "sgd" with momentum 0.9), trains for 3 epochs on
    ``train_loader`` and returns the accuracy measured on ``test_loader``.

    NOTE(review): the score is computed on ``test_loader``, the same loader
    used for the final test accuracy, so the selected hyperparameters are
    tuned on the test data — consider a separate validation split.
    """
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "sgd"])
    # Every trial starts from the same pretrained weights with a new 1-logit head.
    model = models.resnet18(weights="ResNet18_Weights.DEFAULT")
    model.fc = nn.Linear(model.fc.in_features, 1)
    model = model.to(device)

    if optimizer_name == "adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(3):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device, dtype=torch.float32)
            optimizer.zero_grad()
            # squeeze(1) drops only the logit dimension, so a batch of one
            # image still has shape (1,) and matches `labels`.
            outputs = model(images).squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            outputs = model(images).squeeze(1)
            preds = (torch.sigmoid(outputs) > 0.5).long()
            # Plain ints on both sides keep accuracy_score inputs homogeneous.
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.long().tolist())
    return accuracy_score(true_labels, predictions)

# Run a 10-trial search that maximises the value returned by `objective`,
# then report the best hyperparameter set found.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=10)
best_params = study.best_params
print("Best parameters:", best_params)
