## Generate Datasets

### Dataset Distribution

In order to look at the dataset distribution and evaluate its balance, I would create a dataframe with 2 columns:
- 1st column: name of the image file
- 2nd column: label for that image

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2025-06-24 14:13:16.507166: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-24 14:13:16.522746: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750767196.543165   39878 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750767196.550049   39878 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750767196.564893   39878 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# GET THE DIRECTORY TO THE DATASETS
path_data = '../chest_Xray/'
path_train = os.path.join(path_data, 'train')
path_val = os.path.join(path_data, 'val')
path_test = os.path.join(path_data, 'test')

# INITIATE EMPTY LIST OF TRAINING DATA
# THAT WILL BE CONVERTED TO PANDAS DATAFRAME LATER
data_train = []
data_val = []
data_test = []
data = [data_train, data_val, data_test]

# ITERATE THROUGH EACH PATH
for i, d in enumerate([path_train, path_val, path_test]):
    # GET THE PATH TO EACH FOLDER - NORMAL AND PNEUMONIA
    normal = os.path.join(d, 'NORMAL')
    pneumonia = os.path.join(d, 'PNEUMONIA')
    
    # FETCH ALL .jpeg FILES
    normal_imgs = Path(normal).glob('*.jpeg')
    pneumonia_imgs = Path(pneumonia).glob('*.jpeg')
    
    # APPEND THE NAME AND CORRESPONDING LABEL (BASED ON FOLDER NAME)
    # TO data_train list AS TUPLES
    for img in normal_imgs:
        data[i].append((str(img), 0))
    
    for img in pneumonia_imgs:
        data[i].append((str(img), 1))

In [3]:
# CONVERT data_train TO DATAFRAME FOR EASY MANIPULATION
columns = ['image', 'label']
train_df = pd.DataFrame(data_train, columns=columns)
val_df = pd.DataFrame(data_val, columns=columns)
test_df = pd.DataFrame(data_test, columns=columns)

In [4]:
# CONCAT train_df and val_df
master_df = pd.concat([train_df, val_df])

In [5]:
def cat_pneumonia_type(x):
    if 'bacteria' in x:
        output = 'bacteria'
    elif 'virus' in x:
        output = 'virus'
    else:
        output = 'normal'
    return output

In [6]:
master_df['type'] = master_df.image.apply(lambda x: cat_pneumonia_type(x))

In [7]:
master_df.type.value_counts()

type
bacteria    2538
normal      1349
virus       1345
Name: count, dtype: int64

In [8]:
train_df, val_df = train_test_split(master_df, test_size=0.2, random_state=42)

### Data Generators

In [9]:
BATCH_SIZE = 32

# CREATE GENERATOR FOR TRAINING DATA (FROM train_df DATAFRAME)
train_generator = ImageDataGenerator(rescale=1.0/255).flow_from_dataframe(
    dataframe=train_df, x_col='image', y_col='label',
    color_mode='grayscale',
    class_mode='raw',
    target_size=(128,128), 
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=True   # shuffle so we're not missing out on any samples
)

# CREATE GENERATOR FOR VALIDATION DATA (FROM val_df DATAFRAME)
val_generator = ImageDataGenerator(rescale=1.0/255).flow_from_dataframe(
    dataframe=val_df, x_col='image', y_col='label',
    color_mode='grayscale',
    class_mode='raw',
    target_size=(128,128), 
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=True
)

Found 4185 validated image filenames.


Found 1047 validated image filenames.


In [10]:
# CHECKING THE SHAPE OF 1 BATCH OF THE train_generator
images_train, labels_train = next(train_generator)
print(f'Training set: {images_train.shape} | {labels_train.shape}')

Training set: (32, 128, 128, 1) | (32,)


## Modeling

In [11]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Dropout
from keras.metrics import Recall, Precision
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers.schedules import ExponentialDecay
import matplotlib.pyplot as plt

### Baseline Model

Baseline Convolution Neural Networks (CNN) model with:

- One 2D Convolution layer with padding (same) and ReLU activation
- One 2D MaxPooling layer
- A Flatten layer to convert the 2D feature maps into a 1D vector
- One hidden Dense (fully connected) layer with ReLU activation
- One output Dense layer with a sigmoid activation for binary classification



In [12]:
baseline = Sequential()
baseline.add(Conv2D(32, (3,3), activation='relu', padding='same', input_shape=[128,128,1]))
baseline.add(MaxPooling2D(pool_size=(2,2)))
baseline.add(Flatten())
baseline.add(Dense(32, activation='relu'))
baseline.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1750767200.267214   39878 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4124 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [13]:
recall = Recall()
precision = Precision()

In [14]:
baseline.compile(loss='binary_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy', recall, precision])

In [15]:
baseline.summary()

In [16]:
checkpoint_cb = ModelCheckpoint('baseline.h5',
                                save_best_only=True)
early_stopping_cb = EarlyStopping(patience=10, restore_best_weights=True)

BATCH_SIZE = 32
EPOCHS = 30

results_1 = baseline.fit(train_generator, 
                         validation_data=val_generator,
                         epochs=EPOCHS,
                         steps_per_epoch=(train_generator.n//BATCH_SIZE),
                         validation_steps=(val_generator.n//BATCH_SIZE),
                         callbacks=[checkpoint_cb, early_stopping_cb])

  self._warn_if_super_not_called()


Epoch 1/30


I0000 00:00:1750767202.818312   39964 service.cc:152] XLA service 0x7f20540043c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1750767202.818335   39964 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-06-24 14:13:22.870771: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1750767203.078434   39964 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m  1/130[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7:41[0m 4s/step - accuracy: 0.8438 - loss: 0.6567 - precision: 0.8438 - recall: 1.0000

I0000 00:00:1750767205.113813   39964 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step - accuracy: 0.8575 - loss: 0.5795 - precision: 0.8827 - recall: 0.9402



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 220ms/step - accuracy: 0.8579 - loss: 0.5772 - precision: 0.8831 - recall: 0.9403 - val_accuracy: 0.9590 - val_loss: 0.1082 - val_precision: 0.9580 - val_recall: 0.9865
Epoch 2/30
[1m  1/130[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 46ms/step - accuracy: 1.0000 - loss: 0.0425 - precision: 1.0000 - recall: 1.0000



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 1.0000 - loss: 0.0425 - precision: 1.0000 - recall: 1.0000 - val_accuracy: 0.9648 - val_loss: 0.0987 - val_precision: 0.9706 - val_recall: 0.9811
Epoch 3/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step - accuracy: 0.9663 - loss: 0.0938 - precision: 0.9745 - recall: 0.9807



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 202ms/step - accuracy: 0.9663 - loss: 0.0938 - precision: 0.9745 - recall: 0.9807 - val_accuracy: 0.9648 - val_loss: 0.0900 - val_precision: 0.9668 - val_recall: 0.9851
Epoch 4/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 1.0000 - loss: 0.0330 - precision: 1.0000 - recall: 1.0000 - val_accuracy: 0.9629 - val_loss: 0.0922 - val_precision: 0.9617 - val_recall: 0.9878
Epoch 5/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step - accuracy: 0.9723 - loss: 0.0702 - precision: 0.9797 - recall: 0.9830



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 202ms/step - accuracy: 0.9723 - loss: 0.0702 - precision: 0.9798 - recall: 0.9830 - val_accuracy: 0.9697 - val_loss: 0.0750 - val_precision: 0.9758 - val_recall: 0.9824
Epoch 6/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.9375 - loss: 0.0938 - precision: 0.9545 - recall: 0.9545 - val_accuracy: 0.9658 - val_loss: 0.0807 - val_precision: 0.9876 - val_recall: 0.9649
Epoch 7/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 202ms/step - accuracy: 0.9869 - loss: 0.0373 - precision: 0.9922 - recall: 0.9903 - val_accuracy: 0.9639 - val_loss: 0.0873 - val_precision: 0.9916 - val_recall: 0.9583
Epoch 8/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 1.0000 - loss: 0.0188 - precision: 1.0000 - recall: 1.0000 - val_accuracy: 0.9609 - val_loss: 0.1010 - val_precision: 0.9916 - val_recall: 0.9540
Epoch 9/30
[1m130/130[0m 



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 200ms/step - accuracy: 0.9899 - loss: 0.0295 - precision: 0.9939 - recall: 0.9926 - val_accuracy: 0.9678 - val_loss: 0.0744 - val_precision: 0.9706 - val_recall: 0.9851
Epoch 10/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 1.0000 - loss: 0.0048 - precision: 1.0000 - recall: 1.0000 - val_accuracy: 0.9707 - val_loss: 0.0814 - val_precision: 0.9670 - val_recall: 0.9932
Epoch 11/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step - accuracy: 0.9943 - loss: 0.0222 - precision: 0.9966 - recall: 0.9957



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 204ms/step - accuracy: 0.9943 - loss: 0.0222 - precision: 0.9966 - recall: 0.9957 - val_accuracy: 0.9717 - val_loss: 0.0656 - val_precision: 0.9851 - val_recall: 0.9758
Epoch 12/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 1.0000 - loss: 0.0232 - precision: 1.0000 - recall: 1.0000 - val_accuracy: 0.9697 - val_loss: 0.0722 - val_precision: 0.9848 - val_recall: 0.9728
Epoch 13/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 197ms/step - accuracy: 0.9991 - loss: 0.0126 - precision: 0.9998 - recall: 0.9991 - val_accuracy: 0.9688 - val_loss: 0.0743 - val_precision: 0.9758 - val_recall: 0.9811
Epoch 14/30
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 1.0000 - loss: 0.0223 - precision: 1.0000 - recall: 1.0000 - val_accuracy: 0.9697 - val_loss: 0.0747 - val_precision: 0.9758 - val_recall: 0.9824
Epoch 15/30
[1m130/130

In [None]:
def visualize_training_results(results):
    history = results.history
    
    for m in ['loss', 'accuracy', 'precision', 'recall']:
        plt.figure(figsize=(15,5))
        plt.plot(history[m], label='Training', marker='o', color='blue')
        plt.plot(history[f'val_{m}'], label='Validation', marker='o', color='red')
        plt.xlabel('Epochs')
        plt.legend()
        plt.title(m.title())
    
    plt.show();

In [18]:
visualize_training_results(results_1)