# Pneumonia Detection on Chest X-Rays with CNN (TensorFlow/Keras)

_End-to-end training and evaluation of a CNN classifier, with accuracy/loss curves, a confusion matrix, an ROC curve, misclassified examples, and a saved best-model checkpoint (`.keras`)._

**Task:** Binary classification (Normal vs Pneumonia)  
**Framework:** TensorFlow/Keras  
**Dataset:** Chest X-Ray Images (Pneumonia) (Kermany et al., via Kaggle)


In [None]:
# Import libraries

# General-purpose
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

# Deep learning (TensorFlow & Keras)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, Conv2D, MaxPooling2D, Flatten, Dropout,
                                     BatchNormalization, GlobalAveragePooling2D, Input)
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Evaluation (scikit-learn)
from sklearn.metrics import (classification_report, confusion_matrix,
                             ConfusionMatrixDisplay, roc_auc_score, roc_curve, auc)

# Set seeds for every RNG the notebook uses: Python's `random` (used to pick
# the sample images), NumPy, and TensorFlow.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Download and unzip Chest X-Ray Pneumonia dataset

# Install Kaggle API
# !pip install -q kaggle

# If running locally or in Colab, upload your kaggle.json here:
# from google.colab import files
# files.upload()  # Upload kaggle.json manually

# Or manually place kaggle.json in ~/.kaggle/
# (expanduser resolves to /root/.kaggle on Colab and to the user's home elsewhere)
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_token = os.path.join(kaggle_dir, 'kaggle.json')
os.makedirs(kaggle_dir, exist_ok=True)
# os.rename('kaggle.json', kaggle_token)  # Uncomment if using upload method

if os.path.exists(kaggle_token):
    # The mode must be an octal literal: 0o600 = read/write for the owner only.
    # A decimal 600 equals 0o1130 and would set an unrelated permission mask.
    os.chmod(kaggle_token, 0o600)
else:
    # Do not abort the cell when the token has not been provided yet.
    print(f"kaggle.json not found at {kaggle_token}; skipping permission setup.")

# Download and unzip the dataset
# !kaggle datasets download -d paultimothymooney/chest-xray-pneumonia
# !unzip -q chest-xray-pneumonia.zip -d ./pneumonia_data

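# NOTE: For manual setup (e.g., when running a copy of this notebook cloned from GitHub),
# download the dataset yourself and extract it so that this folder exists:
# ./pneumonia_data/chest_xray
# Dataset link: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia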

In [None]:
# Visualize one randomly chosen sample image from each class

train_dir = './pneumonia_data/chest_xray/train'
classes = ['NORMAL', 'PNEUMONIA']
image_extensions = ('.jpeg', '.jpg', '.png')

plt.figure(figsize=(10, 5))
for i, label in enumerate(classes):
    class_dir = os.path.join(train_dir, label)

    # Keep only image files so a stray non-image entry (e.g. .DS_Store) cannot
    # be picked, and sort because os.listdir order is arbitrary -- a seeded
    # RNG only yields a reproducible pick from a stable ordering.
    image_files = sorted(
        name for name in os.listdir(class_dir)
        if name.lower().endswith(image_extensions)
    )
    if not image_files:
        raise FileNotFoundError(f"No image files found in {class_dir}")

    image_file = random.choice(image_files)
    image_path = os.path.join(class_dir, image_file)

    img = mpimg.imread(image_path)

    plt.subplot(1, 2, i + 1)
    plt.imshow(img, cmap='gray')
    plt.title(label)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Data preprocessing: augmented training split, un-augmented validation and test data

# Image parameters
img_size = (150, 150)
batch_size = 16

# Dataset location, relative to the working directory so the notebook runs both
# on Colab (where the working directory is /content) and locally.
data_root = './pneumonia_data/chest_xray'
train_path = os.path.join(data_root, 'train')
test_path = os.path.join(data_root, 'test')

# Fraction of the train directory held out for validation
val_split = 0.2

# Data augmentation for training
# NOTE(review): horizontal_flip mirrors the images left/right; confirm this is
# an intended augmentation for chest X-rays.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    brightness_range=[0.99, 1.01],
    fill_mode='nearest',
    validation_split=val_split
)

# Validation images must NOT be augmented, otherwise validation metrics (which
# drive early stopping, LR scheduling and checkpointing) are measured on
# distorted inputs. Using the same validation_split keeps the held-out files
# identical to the ones excluded from the training subset.
val_datagen = ImageDataGenerator(rescale=1./255, validation_split=val_split)

# Settings shared by every generator
flow_kwargs = dict(
    target_size=img_size,
    batch_size=batch_size,
    color_mode='grayscale',
    class_mode='binary',
)

# Train generator (80% of train directory, augmented)
train_generator = train_datagen.flow_from_directory(
    train_path,
    subset='training',
    shuffle=True,
    seed=42,
    **flow_kwargs
)

# Validation generator (20% of train directory, rescaling only)
val_generator = val_datagen.flow_from_directory(
    train_path,
    subset='validation',
    shuffle=False,
    seed=42,
    **flow_kwargs
)

# Test generator (rescaling only; unshuffled so predictions align with .classes)
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    test_path,
    shuffle=False,
    **flow_kwargs
)

print("Train samples:", train_generator.samples)
print("Validation samples:", val_generator.samples)
print("Test samples:", test_generator.samples)

In [None]:
# Build a simple CNN model using Keras:
# four Conv -> BatchNorm -> MaxPool stages, then a regularised dense head
# ending in a single sigmoid unit for binary classification.

model = Sequential()

# Declare the input with an explicit Input object (passing input_shape= to the
# first layer is discouraged by Keras) and derive the spatial size from
# img_size so the model cannot drift from the generators' target_size.
model.add(Input(shape=(*img_size, 1)))  # single channel: grayscale images

# Convolutional feature extractor; the filter count doubles at every stage
for n_filters in (32, 64, 128, 256):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2)))

# NOTE(review): a dropout rate of 0.8 is very aggressive; kept as in the
# original configuration -- confirm it is intentional.
model.add(Dropout(0.8))

# Classification head
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.005)))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric
learning_rate = 0.00025
adam_optimizer = Adam(learning_rate=learning_rate)

model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

In [None]:
# Train the model with checkpointing, early stopping and LR scheduling

checkpoint_path = 'best_model.keras'

# Save the model to disk whenever validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', mode='max',
    save_best_only=True, verbose=1,
)

# Stop after 4 epochs without a validation-accuracy improvement and
# restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_accuracy', patience=4,
    restore_best_weights=True, verbose=1,
)

# Multiply the learning rate by 0.4 after 2 epochs without a validation-loss
# improvement, never going below 1e-6.
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss', factor=0.4, patience=2,
    min_lr=1e-6, verbose=1,
)

training_callbacks = [checkpoint, early_stopping, lr_scheduler]

# Per-class loss weights (class index -> weight)
class_weights = {0: 1.0, 1: 0.86}

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=20,
    class_weight=class_weights,
    callbacks=training_callbacks,
)

# files.download(checkpoint_path)

In [None]:
# Plot training and validation accuracy and loss, side by side

metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs_range = range(len(acc))

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: accuracy curves
ax_acc.plot(epochs_range, acc, label='Train Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(loc='lower right')

# Right panel: loss curves
ax_loss.plot(epochs_range, loss, label='Train Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend(loc='upper right')

fig.tight_layout()
plt.show()

In [None]:
# Evaluate model on test data

# The model ends in a single sigmoid unit, so predict() yields one probability
# per image in a column; flatten it so predictions line up element-wise with
# the 1-D label array.
y_pred_prob = model.predict(test_generator).ravel()
y_pred = (y_pred_prob > 0.5).astype(int)

# True labels in directory order (valid because the test generator is unshuffled)
y_true = test_generator.classes

# Class names ordered by their integer index, so position i names class i
# regardless of the dict's iteration order.
class_indices = test_generator.class_indices
class_names = sorted(class_indices, key=class_indices.get)
label_ids = [class_indices[name] for name in class_names]

print("Classification Report:")
# Explicit labels keep the report well-formed even if a class is never predicted.
print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))

cm = confusion_matrix(y_true, y_pred, labels=label_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# ROC Curve and AUC for test data

# ROC operating points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_true, y_pred_prob)
roc_auc = roc_auc_score(y_true, y_pred_prob)

fig, ax = plt.subplots(figsize=(8, 6))

# Model ROC curve, plus the diagonal as the chance-level reference
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC)',
)
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
# Display some misclassified test images (up to 6)

# Work on a flat view of the predictions so each element is a scalar,
# whether y_pred is stored as a column or as a 1-D array.
y_pred_flat = np.ravel(y_pred)
misclassified_idx = np.where(y_true != y_pred_flat)[0]

# Map class index -> class name from the generator instead of hard-coding names
index_to_label = {index: name for name, index in test_generator.class_indices.items()}

if misclassified_idx.size == 0:
    print("No misclassified test images.")
else:
    plt.figure(figsize=(12, 6))
    for i, idx in enumerate(misclassified_idx[:6]):
        # filepaths is indexed in the same order as the generator's labels
        img_path = test_generator.filepaths[idx]
        img = mpimg.imread(img_path)

        plt.subplot(2, 3, i + 1)
        plt.imshow(img, cmap='gray')
        true_label = index_to_label[int(y_true[idx])]
        pred_label = index_to_label[int(y_pred_flat[idx])]
        plt.title(f'True: {true_label}\nPred: {pred_label}')
        plt.axis('off')

    plt.tight_layout()
    plt.show()