# Histopathologic Cancer Detection

This notebook is for the Kaggle competition on detecting metastatic cancer in small histopathologic image patches. The goal is to build a binary classification model to predict the probability of tumor presence in each image.

In [None]:

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Root of the competition data; every other path below is derived from it
DATA_DIR = '../input/histopathologic-cancer-detection/'
TRAIN_DIR, TEST_DIR, LABELS_FILE = (
    os.path.join(DATA_DIR, entry)
    for entry in ('train', 'test', 'train_labels.csv')
)


In [None]:

# Read the label table (one row per training image) into a DataFrame
df = pd.read_csv(LABELS_FILE)

# Quick textual overview: first rows, row count, and per-class counts
print(df.head())
print(f"Total images: {len(df)}")
print(f"Class distribution:\n{df['label'].value_counts()}")

# Bar chart of how many images fall into each label value
sns.countplot(data=df, x='label').set_title('Class Distribution')
plt.show()


In [None]:

# Display some sample images
def show_images(image_ids, labels, rows=2, cols=5):
    """Plot a grid of training images, each titled with its label.

    Args:
        image_ids: Sequence of image ids; each is resolved to
            ``TRAIN_DIR/<id>.tif``.
        labels: Sequence of labels, parallel to ``image_ids``.
        rows: Number of grid rows.
        cols: Number of grid columns.

    If fewer than ``rows * cols`` images are supplied, the remaining grid
    cells are left blank; extra images beyond the grid size are ignored.
    """
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so `.flat`
    # is always available.
    _, axes = plt.subplots(rows, cols, figsize=(12, 6), squeeze=False)

    # Switch every cell off first so unused cells render as empty space.
    for ax in axes.flat:
        ax.axis('off')

    # zip stops at the shortest input and iterates positionally, so this works
    # for lists, arrays, and pandas Series regardless of their index.
    for ax, image_id, label in zip(axes.flat, image_ids, labels):
        img_path = os.path.join(TRAIN_DIR, image_id + '.tif')
        ax.imshow(plt.imread(img_path))
        ax.set_title(f"Label: {label}")
    plt.show()

# Fixed random_state so the preview shows the same images on every full re-run;
# n is capped so a label table with fewer than 10 rows does not raise.
sample = df.sample(n=min(10, len(df)), random_state=42)
show_images(sample['id'].values, sample['label'].values)


In [None]:

# Data preprocessing
IMG_SIZE = 96  # Resize images
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2  # 20% for validation

# flow_from_dataframe treats x_col values as file names inside `directory`, and
# class_mode='binary' only accepts string labels. The label table holds bare
# ids (the image files are '<id>.tif'), so build a generator-ready copy instead
# of mutating df.
train_df = df.assign(
    filename=df['id'].astype(str) + '.tif',
    label=df['label'].astype(str),
)

# Augmentation is applied to the training subset only.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    validation_split=VALIDATION_SPLIT
)

# Validation images are only rescaled, so validation metrics are measured on
# undistorted images. The same validation_split over the same dataframe keeps
# the two subsets disjoint.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

# Arguments shared by both generators.
flow_kwargs = dict(
    dataframe=train_df,
    directory=TRAIN_DIR,
    x_col='filename',
    y_col='label',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='binary',
)

train_generator = train_datagen.flow_from_dataframe(subset='training', **flow_kwargs)

# shuffle=False keeps validation batches in dataframe order, so predictions can
# be lined up with val_generator.classes.
val_generator = val_datagen.flow_from_dataframe(
    subset='validation', shuffle=False, **flow_kwargs
)

# Invalid file names are dropped with only a warning; fail loudly rather than
# training on an empty stream.
if train_generator.n == 0 or val_generator.n == 0:
    raise RuntimeError(
        f"No usable images found under {TRAIN_DIR!r}; check the file names and extension."
    )


In [None]:

# Assemble a small CNN layer by layer: two conv -> pool -> batch-norm stages,
# followed by a dense head that ends in a single sigmoid unit.
model = Sequential()

# Stage 1
model.add(Conv2D(32, (3,3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)))
model.add(MaxPooling2D(pool_size=2, strides=2))
model.add(BatchNormalization())

# Stage 2
model.add(Conv2D(64, (3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=2, strides=2))
model.add(BatchNormalization())

# Classifier head
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked per epoch
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:

# Fit the model on the training generator and evaluate on the validation
# generator after every epoch; the per-epoch metrics are kept in `history`.
NUM_EPOCHS = 10
history = model.fit(x=train_generator, epochs=NUM_EPOCHS, validation_data=val_generator)


In [None]:

# Draw the per-epoch accuracy curves recorded during training, one line per split
for metric_key, legend_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=legend_label)
plt.title('Model Accuracy')
plt.legend()
plt.show()


In [None]:

# Write the trained model to an HDF5 file in the working directory
model.save(filepath='cancer_detection_model.h5')


In [None]:

# Generate predictions for the test set
# os.listdir returns entries in arbitrary order; sort so the submission rows
# come out in the same order on every run.
test_ids = sorted(os.listdir(TEST_DIR))
test_images = [os.path.join(TEST_DIR, img_id) for img_id in test_ids]


def load_and_preprocess_image(image_path):
    """Load one image, resize it to IMG_SIZE x IMG_SIZE, and scale pixels by 1/255.

    Args:
        image_path: Path to the image file.

    Returns:
        The image as an array with values divided by 255.0, mirroring the
        ``rescale=1./255`` used for the training generators.
    """
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=(IMG_SIZE, IMG_SIZE))
    return tf.keras.preprocessing.image.img_to_array(img) / 255.0


# Predict chunk by chunk so only one chunk of decoded images is held in memory
# at a time, instead of the whole test set.
PREDICT_CHUNK_SIZE = 1024
chunk_predictions = []
for start in range(0, len(test_images), PREDICT_CHUNK_SIZE):
    chunk_paths = test_images[start:start + PREDICT_CHUNK_SIZE]
    chunk = np.stack([load_and_preprocess_image(path) for path in chunk_paths])
    chunk_predictions.append(model.predict(chunk, verbose=0))

# One row per test image, one column (the sigmoid output). An empty test
# directory yields an empty array rather than a concatenate error.
predictions = (
    np.concatenate(chunk_predictions)
    if chunk_predictions
    else np.empty((0, 1), dtype='float32')
)

# Prepare submission file
# The id is the file name without its extension; splitext avoids depending on
# the platform's path separator.
submission_df = pd.DataFrame({'id': [os.path.splitext(img_id)[0] for img_id in test_ids],
                              'label': predictions.flatten()})
submission_df.to_csv('submission.csv', index=False)
print("Submission file saved!")
