# Histopathologic Cancer Detection 
**Optimized for CPU/Local Training**

This notebook is configured to run on your local machine, with the dataset stored on the C: drive.

**Key Differences from Kaggle Version:**
- CPU-optimized (no GPU/CUDA required)
- Smaller batch sizes for memory efficiency
- Fewer epochs for faster iteration
- Works with local data paths

**Author**: gittaqui  
**GitHub**: https://github.com/gittaqui/WK_3_CNN_Cancer_Detection

In [None]:
# Setup - Force CPU usage to avoid CUDA errors
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Disable GPU
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'   # Reduce TensorFlow warnings

import numpy as np
import pandas as pd
import random
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0, MobileNetV2
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report

# Verify CPU mode: CUDA_VISIBLE_DEVICES was set to '-1' above, so TensorFlow
# is expected to report zero GPUs here. Query the device list once and reuse it.
visible_gpus = tf.config.list_physical_devices('GPU')
print(f"TensorFlow: {tf.__version__}")
print(f"GPU Available: {len(visible_gpus)}")
print(f"Running on: {'GPU' if visible_gpus else 'CPU'}")

# Seed Python, NumPy and TensorFlow RNGs to make runs more repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# The check mark was mis-encoded (mojibake) in the original message.
print("\n✓ Setup complete - Running in CPU mode")

## 1. Configuration - CPU Optimized

In [None]:
# Paths - Local data on C: drive
BASE_PATH = Path('C:/kaggle_data/cancer_detection')
TRAIN_DIR = BASE_PATH / 'train'
TEST_DIR = BASE_PATH / 'test'
TRAIN_LABELS = BASE_PATH / 'train_labels.csv'

# Verify paths (informational only: nothing is raised here, so a False
# value shows up as an error in the cell that first reads the data)
print("Checking data paths...")
print(f"Base path exists: {BASE_PATH.exists()}")
print(f"Train dir exists: {TRAIN_DIR.exists()}")
print(f"Test dir exists: {TEST_DIR.exists()}")
print(f"Labels file exists: {TRAIN_LABELS.exists()}")

# CPU-Optimized Configuration: single source of truth for every later cell
CONFIG = {
    # Image settings
    'IMG_SIZE': 96,
    'BATCH_SIZE': 32,  # Smaller for CPU
    'CHANNELS': 3,

    # Training settings
    'EPOCHS': 15,  # Fewer epochs for local testing
    'LEARNING_RATE': 1e-3,
    'VALIDATION_SPLIT': 0.2,

    # Model settings
    'ARCHITECTURE': 'MobileNetV2',  # Lighter model for CPU
    'DROPOUT_RATE': 0.3,
    'DENSE_UNITS': 128,  # Smaller for CPU

    # Augmentation
    'USE_AUGMENTATION': True,
    'TTA_STEPS': 3,  # Fewer TTA steps for speed

    # Optimization
    'EARLY_STOPPING_PATIENCE': 4,
    'REDUCE_LR_PATIENCE': 2,
    'REDUCE_LR_FACTOR': 0.5,

    # Sampling (for quick tests)
    'USE_SUBSET': False,  # Set True for quick testing
    'SUBSET_SIZE': 10000,  # Use smaller dataset for testing
}

print("\nConfiguration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Save directory for checkpoints, logs and figures.
# parents=True: the path is several levels deep, and mkdir() without it
# raises FileNotFoundError when any intermediate folder is missing.
SAVE_DIR = Path('D:/MS_in_AI/WK3_CNN_Detection/WK_3_CNN_Cancer_Detection/models')
SAVE_DIR.mkdir(parents=True, exist_ok=True)
print(f"\nModels will be saved to: {SAVE_DIR}")

## 2. Load Data

In [None]:
# Load training labels
train_df = pd.read_csv(TRAIN_LABELS)
print(f"Total training samples: {len(train_df):,}")

# Use subset if configured
if CONFIG['USE_SUBSET']:
    # Clamp the request: DataFrame.sample raises ValueError when n exceeds
    # the number of rows (sampling is without replacement).
    subset_size = min(CONFIG['SUBSET_SIZE'], len(train_df))
    print(f"\nUsing subset of {subset_size:,} samples for quick testing")
    train_df = train_df.sample(n=subset_size, random_state=SEED).reset_index(drop=True)
    print(f"Subset size: {len(train_df):,}")

# Add filename (image files are named "<id>.tif")
train_df['filename'] = train_df['id'].apply(lambda x: f'{x}.tif')

# Class distribution
print(f"\nClass distribution:")
class_counts = train_df['label'].value_counts().sort_index()
print(class_counts)
print(f"\nClass balance:")
print(train_df['label'].value_counts(normalize=True) * 100)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
class_counts.plot(kind='bar', ax=axes[0], color=['green', 'red'], alpha=0.7)
axes[0].set_title('Class Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Class (0=Benign, 1=Cancer)', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_xticklabels(['Benign (0)', 'Cancer (1)'], rotation=0)
for i, v in enumerate(class_counts):
    axes[0].text(i, v + 1000, f'{v:,}', ha='center', fontweight='bold')

# Sample images: three per class, drawn reproducibly
sample_images = train_df.groupby('label').sample(n=3, random_state=SEED)
axes[1].axis('off')
axes[1].set_title('Sample Images by Class', fontsize=14, fontweight='bold')

# Create subplot grid for samples
fig2, axes2 = plt.subplots(2, 3, figsize=(12, 8))
# zip over the flattened grid so the number of samples can never index
# past the 2x3 layout.
for ax, (_, row) in zip(axes2.flat, sample_images.iterrows()):
    # Hide the frame up front so a missing file leaves a blank cell
    # instead of an empty set of ticks.
    ax.axis('off')
    img_path = TRAIN_DIR / row['filename']
    if not img_path.exists():
        continue
    # Context manager closes the file handle once the pixels are drawn.
    with Image.open(img_path) as img:
        ax.imshow(img)
    ax.set_title(f"Class: {row['label']} ({'Cancer' if row['label'] == 1 else 'Benign'})",
                 fontsize=11, fontweight='bold')

# Lay out both figures; plt.tight_layout() alone only touches the current one.
fig.tight_layout()
fig2.tight_layout()
plt.show()

display(train_df.head(10))

## 3. Train/Validation Split

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Hold out a validation fraction while keeping the label ratio equal in
# both partitions (stratified on the 'label' column).
train_data, val_data = train_test_split(
    train_df,
    stratify=train_df['label'],
    test_size=CONFIG['VALIDATION_SPLIT'],
    random_state=SEED,
)

print(f"Training samples: {len(train_data):,}")
print(f"Validation samples: {len(val_data):,}")

# Per-class counts for each partition
for split_name, split_frame in (("Training", train_data), ("Validation", val_data)):
    print(f"\n{split_name} class distribution:")
    print(split_frame['label'].value_counts().sort_index())

# 'balanced' class weights for the imbalanced labels; keyed by position
# in the sorted unique-label array.
train_labels = train_data['label']
class_weights_array = compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights = {position: weight for position, weight in enumerate(class_weights_array)}

print(f"\nClass weights (for balanced training):")
print(f"  Class 0 (Benign): {class_weights[0]:.3f}")
print(f"  Class 1 (Cancer): {class_weights[1]:.3f}")

## 4. Data Generators with Augmentation

In [None]:
# Training generator with augmentation
if CONFIG['USE_AUGMENTATION']:
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=180,
        width_shift_range=0.15,
        height_shift_range=0.15,
        shear_range=0.15,
        zoom_range=0.15,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='reflect',
        brightness_range=[0.85, 1.15]
    )
else:
    train_datagen = ImageDataGenerator(rescale=1./255)

# Validation generator (no augmentation)
val_datagen = ImageDataGenerator(rescale=1./255)

# flow_from_dataframe(class_mode='binary') requires the label column to hold
# strings and raises TypeError on the integer labels read from the CSV.
# Cast on copies so train_data / val_data keep numeric labels for the
# class-weight and metric code elsewhere in the notebook.
train_gen_df = train_data.assign(label=train_data['label'].astype(str))
val_gen_df = val_data.assign(label=val_data['label'].astype(str))

# Arguments shared by both generators. `classes` pins the label -> index
# mapping ('0' -> 0, '1' -> 1) so it lines up with the class_weights keys.
flow_kwargs = dict(
    directory=str(TRAIN_DIR),
    x_col='filename',
    y_col='label',
    target_size=(CONFIG['IMG_SIZE'], CONFIG['IMG_SIZE']),
    batch_size=CONFIG['BATCH_SIZE'],
    class_mode='binary',
    classes=['0', '1'],
)

# Create generators
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_gen_df,
    shuffle=True,
    seed=SEED,
    **flow_kwargs
)

# shuffle=False keeps predictions in the same row order as val_data
val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_gen_df,
    shuffle=False,
    **flow_kwargs
)

print(f"Training batches per epoch: {len(train_generator)}")
print(f"Validation batches: {len(val_generator)}")
print(f"\nAugmentation enabled: {CONFIG['USE_AUGMENTATION']}")

## 5. Build Model - CPU Optimized

In [None]:
def build_model(architecture='MobileNetV2'):
    """
    Build and compile a lightweight transfer-learning classifier for CPU training.

    The network is an ImageNet-pretrained backbone (global-average pooled)
    followed by Dropout -> Dense(relu) -> BatchNorm -> Dropout -> Dense(sigmoid).
    Only the last 15 backbone layers are left trainable.

    Input contract: images scaled to [0, 1], which is what every
    ImageDataGenerator in this notebook produces via ``rescale=1./255``.
    The model converts that range to what the chosen backbone expects, so
    the generators must NOT be combined with ``preprocess_input`` as well
    (doing both squashes MobileNetV2 inputs into roughly [-1, -0.99]).

    Args:
        architecture: 'MobileNetV2' or 'EfficientNetB0'.

    Returns:
        A compiled ``keras.Model`` with binary cross-entropy loss and
        'accuracy' / 'auc' metrics. Sizes and rates are read from CONFIG.

    Raises:
        ValueError: if ``architecture`` is not one of the supported names.
    """
    # Fail fast, before any backbone is built or weights are fetched.
    if architecture not in ('MobileNetV2', 'EfficientNetB0'):
        raise ValueError(f"Unknown architecture: {architecture}")

    input_shape = (CONFIG['IMG_SIZE'], CONFIG['IMG_SIZE'], CONFIG['CHANNELS'])

    # Choose base model (CPU-friendly) and the matching input scaling
    if architecture == 'MobileNetV2':
        base_model = MobileNetV2(
            include_top=False,
            weights='imagenet',
            input_shape=input_shape,
            pooling='avg'
        )
        # MobileNetV2 expects pixels in [-1, 1]: map [0, 1] -> [-1, 1].
        input_scaling = layers.Rescaling(scale=2.0, offset=-1.0)
    else:
        base_model = EfficientNetB0(
            include_top=False,
            weights='imagenet',
            input_shape=input_shape,
            pooling='avg'
        )
        # Keras EfficientNet normalises internally and expects raw
        # [0, 255] pixels: map [0, 1] -> [0, 255].
        input_scaling = layers.Rescaling(scale=255.0)

    # Fine-tune last layers: freeze everything except the final 15
    base_model.trainable = True
    for layer in base_model.layers[:-15]:
        layer.trainable = False

    # Build model
    inputs = keras.Input(shape=input_shape)
    x = input_scaling(inputs)
    # training=False keeps the backbone's BatchNorm layers in inference mode.
    # A hard-coded training=True would make the unfrozen BatchNorm layers use
    # per-batch statistics even inside model.predict(), so predictions would
    # depend on which images share a batch.
    x = base_model(x, training=False)
    x = layers.Dropout(CONFIG['DROPOUT_RATE'])(x)
    x = layers.Dense(CONFIG['DENSE_UNITS'], activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['DROPOUT_RATE'] / 2)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs, outputs)

    # Compile
    model.compile(
        optimizer=keras.optimizers.Adam(CONFIG['LEARNING_RATE']),
        loss='binary_crossentropy',
        metrics=['accuracy', keras.metrics.AUC(name='auc')]
    )

    return model

# Instantiate the configured architecture and report its size
print("Building model...")
model = build_model(CONFIG['ARCHITECTURE'])

# Parameter bookkeeping: total, trainable, and the frozen remainder
total_params = model.count_params()
trainable_params = sum(
    tf.keras.backend.count_params(weight) for weight in model.trainable_weights
)
frozen_params = total_params - trainable_params

print(f"\nModel: {CONFIG['ARCHITECTURE']}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Non-trainable parameters: {frozen_params:,}")

# Layer-by-layer overview
model.summary()

## 6. Train Model

In [None]:
# Callbacks, built one at a time so each one's role is easy to see

# Stop once validation AUC has not improved for the configured number of
# epochs, and roll the weights back to the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=CONFIG['EARLY_STOPPING_PATIENCE'],
    restore_best_weights=True,
    verbose=1,
)

# Shrink the learning rate when validation loss plateaus.
lr_schedule = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    patience=CONFIG['REDUCE_LR_PATIENCE'],
    factor=CONFIG['REDUCE_LR_FACTOR'],
    min_lr=1e-7,
    verbose=1,
)

# Keep only the checkpoint with the best validation AUC.
checkpoint = keras.callbacks.ModelCheckpoint(
    str(SAVE_DIR / 'best_model_local.h5'),
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Per-epoch metrics are appended to a CSV (existing rows are kept).
csv_logger = keras.callbacks.CSVLogger(
    str(SAVE_DIR / 'training_log.csv'),
    append=True,
)

callbacks = [early_stopping, lr_schedule, checkpoint, csv_logger]

# Train
banner = "=" * 70
print(f"\n{banner}")
print("TRAINING STARTED")
print(banner)
print("Running on: CPU")
print(f"Epochs: {CONFIG['EPOCHS']}")
print(f"Batch size: {CONFIG['BATCH_SIZE']}")
print(f"Training samples: {len(train_data):,}")
print(f"Validation samples: {len(val_data):,}")
print(f"{banner}\n")

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=CONFIG['EPOCHS'],
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1,
)

print(f"\n{banner}")
print("TRAINING COMPLETED")
print(banner)

## 7. Training Results

In [None]:
# Plot training history: one panel per metric, train vs. validation curve
metric_log = history.history
panel_specs = [
    # (history key, legend suffix, panel title, y-axis label)
    ('loss', 'Loss', 'Loss Over Epochs', 'Loss'),
    ('accuracy', 'Acc', 'Accuracy Over Epochs', 'Accuracy'),
    ('auc', 'AUC', 'AUC Score Over Epochs', 'AUC'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (metric_key, legend_name, panel_title, y_label) in zip(axes, panel_specs):
    ax.plot(metric_log[metric_key], label=f'Train {legend_name}', linewidth=2)
    ax.plot(metric_log[f'val_{metric_key}'], label=f'Val {legend_name}', linewidth=2)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(SAVE_DIR / 'training_history.png', dpi=150, bbox_inches='tight')
plt.show()

# Print best validation metrics and the last-epoch training metrics
rule = "=" * 50
print("\nTraining Summary:")
print(rule)
print(f"Best Validation Loss: {min(metric_log['val_loss']):.4f}")
print(f"Best Validation Accuracy: {max(metric_log['val_accuracy']):.4f}")
print(f"Best Validation AUC: {max(metric_log['val_auc']):.4f}")
print(f"\nFinal Training Loss: {metric_log['loss'][-1]:.4f}")
print(f"Final Training Accuracy: {metric_log['accuracy'][-1]:.4f}")
print(f"Final Training AUC: {metric_log['auc'][-1]:.4f}")
print(rule)

## 8. Evaluate on Validation Set

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the hold-out set; val_generator is unshuffled, so predictions are
# compared position-by-position with the labels in val_data.
print("Generating predictions on validation set...")
val_predictions = model.predict(val_generator, verbose=1)
val_pred_proba = val_predictions.flatten()
val_true = val_data['label'].values
val_pred_binary = (val_pred_proba > 0.5).astype(int)  # hard labels at 0.5

# Threshold-based metrics use the hard labels; AUC uses the probabilities
val_accuracy = accuracy_score(val_true, val_pred_binary)
val_precision = precision_score(val_true, val_pred_binary)
val_recall = recall_score(val_true, val_pred_binary)
val_f1 = f1_score(val_true, val_pred_binary)
val_auc = roc_auc_score(val_true, val_pred_proba)

separator = "=" * 50
print("\nValidation Metrics:")
print(separator)
for metric_label, metric_value in (
    ("Accuracy:", val_accuracy),
    ("Precision:", val_precision),
    ("Recall:", val_recall),
    ("F1 Score:", val_f1),
    ("AUC Score:", val_auc),
):
    # Left-pad labels to a common width so the values line up
    print(f"{metric_label:<11}{metric_value:.4f}")
print(separator)

# Confusion matrix (left panel) and ROC curve (right panel)
cm = confusion_matrix(val_true, val_pred_binary)
class_names = ['Benign', 'Cancer']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
cm_ax, roc_ax = axes

sns.heatmap(
    cm,
    ax=cm_ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
)
cm_ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
cm_ax.set_xlabel('Predicted', fontsize=12)
cm_ax.set_ylabel('True', fontsize=12)

fpr, tpr, thresholds = roc_curve(val_true, val_pred_proba)
roc_ax.plot(fpr, tpr, linewidth=2, label=f'ROC (AUC = {val_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
roc_ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
roc_ax.set_xlabel('False Positive Rate', fontsize=12)
roc_ax.set_ylabel('True Positive Rate', fontsize=12)
roc_ax.legend(fontsize=11)
roc_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(SAVE_DIR / 'evaluation_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

# Per-class precision / recall / F1 table
print("\nDetailed Classification Report:")
report_text = classification_report(
    val_true,
    val_pred_binary,
    target_names=['Benign (0)', 'Cancer (1)'],
)
print(report_text)

## 9. Generate Test Predictions with TTA

In [None]:
# Index every .tif in the test folder: 'id' is the file stem, 'filename'
# the name (with extension) that the data generator looks up.
test_files = list(TEST_DIR.glob('*.tif'))
test_df = pd.DataFrame(
    [(tif_path.stem, tif_path.name) for tif_path in test_files],
    columns=['id', 'filename'],
)

print(f"Test samples: {len(test_df):,}")

# Test-Time Augmentation
def predict_with_tta(model, test_df, n_tta=3):
    """
    Average model predictions over several Test-Time Augmentation passes.

    Pass 0 only rescales the images; every later pass additionally applies
    random rotation (up to 20 degrees) and horizontal/vertical flips. All
    passes read the images in the same, unshuffled order, so the per-pass
    outputs can be averaged element-wise.

    Image size and batch size come from CONFIG; files are read from TEST_DIR.

    Args:
        model: object exposing ``predict(generator, verbose=...)``.
        test_df: DataFrame with a 'filename' column naming files in TEST_DIR.
        n_tta: number of prediction passes to average (must be >= 1).

    Returns:
        1-D array holding the mean prediction across passes.

    Raises:
        ValueError: if ``n_tta`` is smaller than 1. Without this check the
            mean of zero passes silently produced NaN, which only surfaced
            later when the submission table was built.
    """
    if n_tta < 1:
        raise ValueError(f"n_tta must be at least 1, got {n_tta}")

    all_predictions = []

    for tta_idx in tqdm(range(n_tta), desc="TTA Progress"):
        if tta_idx == 0:
            # No augmentation
            datagen = ImageDataGenerator(rescale=1./255)
        else:
            # With augmentation
            datagen = ImageDataGenerator(
                rescale=1./255,
                rotation_range=20,
                horizontal_flip=True,
                vertical_flip=True
            )

        # shuffle=False keeps row order identical across passes, which the
        # element-wise average below relies on.
        test_gen = datagen.flow_from_dataframe(
            dataframe=test_df,
            directory=str(TEST_DIR),
            x_col='filename',
            y_col=None,
            target_size=(CONFIG['IMG_SIZE'], CONFIG['IMG_SIZE']),
            batch_size=CONFIG['BATCH_SIZE'],
            class_mode=None,
            shuffle=False
        )

        preds = model.predict(test_gen, verbose=0)
        all_predictions.append(preds)

    # Average predictions across passes (axis 0 indexes the pass)
    final_predictions = np.mean(all_predictions, axis=0)
    return final_predictions.flatten()

# Run TTA inference over the test set and print a quick sanity summary
print("\nGenerating test predictions with TTA...")
test_predictions = predict_with_tta(model, test_df, n_tta=CONFIG['TTA_STEPS'])

lowest, highest = test_predictions.min(), test_predictions.max()
print(f"\nPredictions complete!")
print(f"Shape: {test_predictions.shape}")
print(f"Range: [{lowest:.4f}, {highest:.4f}]")
print(f"Mean: {test_predictions.mean():.4f}")
print(f"Median: {np.median(test_predictions):.4f}")

## 10. Create Kaggle Submission

In [None]:
# Create submission: one row per test image with its predicted probability
submission = pd.DataFrame({
    'id': test_df['id'],
    'label': test_predictions
})

# Sort by id
submission = submission.sort_values('id').reset_index(drop=True)

# Save to project directory; create the folder first so to_csv does not
# fail when the project path does not exist yet.
submission_path = Path('D:/MS_in_AI/WK3_CNN_Detection/WK_3_CNN_Cancer_Detection/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

# The check mark was mis-encoded (mojibake) in the original message.
print("✓ Submission file created!")
print(f"\nSaved to: {submission_path}")
print(f"\nSubmission shape: {submission.shape}")
print(f"\nFirst 10 rows:")
display(submission.head(10))

# Statistics
print(f"\nPrediction Statistics:")
print("="*50)
print(f"Min:    {submission['label'].min():.4f}")
print(f"Max:    {submission['label'].max():.4f}")
print(f"Mean:   {submission['label'].mean():.4f}")
print(f"Median: {submission['label'].median():.4f}")
print(f"Std:    {submission['label'].std():.4f}")
print("="*50)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(submission['label'], bins=50, color='skyblue', edgecolor='black', alpha=0.7)
axes[0].axvline(0.5, color='red', linestyle='--', linewidth=2, label='Threshold=0.5')
axes[0].set_title('Distribution of Predictions', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Predicted Probability', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

# Binary predictions
binary_preds = (submission['label'] > 0.5).astype(int)
# reindex to [0, 1] so both classes always get a bar. If every prediction
# falls on one side of 0.5, value_counts() returns a single row and the two
# fixed tick labels below would mislabel it (or raise in recent matplotlib).
binary_counts = binary_preds.value_counts().reindex([0, 1], fill_value=0)
binary_counts.plot(kind='bar', ax=axes[1], color=['green', 'red'], alpha=0.7)
axes[1].set_title('Predicted Classes (threshold=0.5)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Class', fontsize=12)
axes[1].set_ylabel('Count', fontsize=12)
axes[1].set_xticklabels(['Benign (0)', 'Cancer (1)'], rotation=0)
for i, v in enumerate(binary_counts):
    axes[1].text(i, v + 500, f'{v:,}', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig(SAVE_DIR / 'submission_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n" + "="*70)
print("ALL DONE! READY TO SUBMIT TO KAGGLE!")
print("="*70)
print(f"\nSubmission file: {submission_path}")
print(f"Model file: {SAVE_DIR / 'best_model_local.h5'}")
print(f"\nUpload submission.csv to:")
print(f"https://www.kaggle.com/c/histopathologic-cancer-detection/submit")
print("\n" + "="*70)

## Summary

### Model Configuration:
- **Architecture**: MobileNetV2 (CPU-optimized)
- **Training**: Local machine (CPU)
- **Data**: Full dataset from C: drive
- **Augmentation**: Rotation, flips, shifts, shear, zoom, brightness
- **TTA**: 3 prediction passes (1 unaugmented + 2 augmented), averaged

### Next Steps:
1. ✅ Model trained and evaluated
2. ✅ Submission file created
3. 📤 Upload `submission.csv` to Kaggle
4. 📊 Check leaderboard score
5. 📸 Take screenshot of leaderboard

### To Improve Score:
- Upload and run the Kaggle competition notebook (GPU accelerated)
- Use EfficientNetB3/B4 instead of MobileNetV2
- Train for more epochs (25-30)
- Increase TTA steps to 5-10
- Train multiple folds and ensemble

**Good luck! 🎯**