In [None]:
# Import necessary libraries

# os: A built-in Python library for interacting with the operating system. 
# We'll use it to work with file paths.
import os

# numpy: A fundamental package for numerical computation in Python.
# It provides support for large, multi-dimensional arrays and matrices, 
# along with a large collection of high-level mathematical functions to operate on these arrays.
import numpy as np

# matplotlib.pyplot: A plotting library for the Python programming language and its numerical mathematics extension NumPy.
# We'll use it to visualize our images and plot graphs of our training and validation metrics.
import matplotlib.pyplot as plt

# pandas: A fast, powerful, flexible and easy to use open source data analysis and manipulation tool.
# While we might not use it heavily for image data, it's a good practice to have it ready for any tabular data we might encounter.
import pandas as pd

# tensorflow: An end-to-end open source platform for machine learning.
# It has a comprehensive, flexible ecosystem of tools, libraries and community resources that lets researchers push the state-of-the-art in ML 
# and developers easily build and deploy ML powered applications.
import tensorflow as tf

# ImageDataGenerator: A class from the Keras library (which is part of TensorFlow) that lets you build Python generators for image data.
# These generators can automatically load, preprocess, and augment images in real time during model training.
# This is a very memory-efficient way to work with large datasets.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Dataset locations, relative to the notebook.
# Kept together at the top of the cell so they are easy to change if the data moves.
train_dir = '../data/chestxrays/train'
test_dir = '../data/chestxrays/test'

# Training / preprocessing parameters.
batch_size = 20   # images per batch yielded by each generator
epochs = 30       # number of passes over the training data
IMG_HEIGHT = 150  # every image is resized to IMG_HEIGHT x IMG_WIDTH
IMG_WIDTH = 150

# Generator configuration for the training set: rescaling plus random,
# on-the-fly augmentation to reduce overfitting on a limited dataset.
train_datagen = ImageDataGenerator(
    # Rescale pixel values from [0, 255] to [0, 1].
    rescale=1./255,
    # Random rotation, in degrees.
    rotation_range=20,
    # Random horizontal / vertical translation, as a fraction of width / height.
    width_shift_range=0.2,
    height_shift_range=0.2,
    # Random shear and zoom.
    shear_range=0.2,
    zoom_range=0.2,
    # Randomly mirror images left/right.
    # NOTE(review): chest anatomy is not left/right symmetric (the heart sits
    # left of the midline), so mirrored radiographs are not anatomically
    # typical -- confirm this augmentation is actually wanted.
    horizontal_flip=True,
    # How to fill pixels exposed by a rotation or shift.
    fill_mode='nearest'
)

# Generator configuration for the held-out images: rescaling only, so the
# model is evaluated on unmodified images.
test_datagen = ImageDataGenerator(rescale=1./255)

# flow_from_directory() reads images straight from disk and labels them from
# the sub-directory names; class_mode='binary' yields 0/1 labels.
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=batch_size,
    class_mode='binary'
)

# NOTE(review): this generator is named "validation" but reads from test_dir,
# so any metric monitored on it during training is not an untouched hold-out.
validation_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=batch_size,
    class_mode='binary'
)

# Number of images in each split, taken from the generators themselves.
# Counting files on disk (e.g. with os.walk) also counts hidden or non-image
# files that flow_from_directory() skips, so the two figures could disagree.
# The names are kept for any later cell that needs the dataset sizes.
num_train = train_generator.samples
num_val = validation_generator.samples

# Fail early with a clear message instead of an obscure error during fit().
if num_train == 0 or num_val == 0:
    raise ValueError(
        f"No images found (train={num_train}, validation={num_val}); "
        f"check train_dir={train_dir!r} and test_dir={test_dir!r}."
    )

In [None]:
# Build the CNN model

# Number of 3x3 filters in each convolution stage, shallowest stage first.
conv_filter_counts = (32, 64, 128, 128)

# Feature extractor: every stage is a ReLU convolution followed by max pooling
# (window 2, stride 2), which shrinks the spatial size of the feature maps.
cnn_layers = []
for stage_index, filter_count in enumerate(conv_filter_counts):
    conv_options = {'activation': 'relu'}
    if stage_index == 0:
        # Only the first layer declares the input:
        # IMG_HEIGHT x IMG_WIDTH images with 3 colour channels.
        conv_options['input_shape'] = (IMG_HEIGHT, IMG_WIDTH, 3)
    cnn_layers.append(tf.keras.layers.Conv2D(filter_count, (3, 3), **conv_options))
    cnn_layers.append(tf.keras.layers.MaxPooling2D(2, 2))

# Classifier head: flatten the feature maps into a vector, pass it through a
# 512-unit dense layer, and finish with a single sigmoid unit whose output in
# (0, 1) is read as the probability of the class labelled 1.
cnn_layers.append(tf.keras.layers.Flatten())
cnn_layers.append(tf.keras.layers.Dense(512, activation='relu'))
cnn_layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))

# Sequential stacks the layers in list order.
model = tf.keras.models.Sequential(cnn_layers)

# Show the architecture and parameter counts as a sanity check.
model.summary()

In [None]:
# Compile the model

# Learning-process configuration, gathered in one place before compiling.
compile_options = {
    # Algorithm that updates the weights.
    'optimizer': 'adam',
    # Quantity minimised during training; the usual choice for a two-class
    # problem with a single sigmoid output.
    'loss': 'binary_crossentropy',
    # Reported alongside the loss for both training and validation.
    'metrics': ['accuracy'],
}

# compile() attaches the optimizer, loss and metrics to the model.
model.compile(**compile_options)

In [None]:
# Train the model

# fit() runs `epochs` passes over the training generator and evaluates on the
# validation generator after each pass. The returned History object records
# the per-epoch loss and metrics.
history = model.fit(
    # The training data generator.
    train_generator,
    # len(generator) is the number of batches in one full pass, rounded up.
    # A floor division of the image count by the batch size would skip the
    # final partial batch every epoch and would be 0 for a dataset smaller
    # than one batch.
    steps_per_epoch=len(train_generator),
    # The number of epochs to train the model for.
    epochs=epochs,
    # The validation data generator.
    validation_data=validation_generator,
    # Evaluate on every validation batch, including the last partial one.
    validation_steps=len(validation_generator)
)

In [None]:
# Plot the training and validation accuracy and loss

# Per-epoch metrics recorded by fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Derive the x-axis from the recorded history rather than the `epochs`
# setting, so the plot still works if `epochs` was edited after training or
# training ended before the configured number of epochs.
epochs_range = range(len(acc))

# Two side-by-side panels: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(8, 8))

acc_ax.plot(epochs_range, acc, label='Training Accuracy')
acc_ax.plot(epochs_range, val_acc, label='Validation Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set(title='Training and Validation Accuracy', xlabel='Epoch', ylabel='Accuracy')

loss_ax.plot(epochs_range, loss, label='Training Loss')
loss_ax.plot(epochs_range, val_loss, label='Validation Loss')
loss_ax.legend(loc='upper right')
loss_ax.set(title='Training and Validation Loss', xlabel='Epoch', ylabel='Loss')

# Keep the axis labels of the two panels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Save the model

# Persist the trained model so it can be reloaded later without retraining.
# The .h5 extension selects the HDF5 file format.
model_path = 'tuberculosis_model.h5'
model.save(model_path)