<a href="https://colab.research.google.com/github/denistoo749/Malaria-Cell-Classification/blob/main/malaria_cell_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Create a model which is able to classify whether a blood smear is uninfected or parasitized.

# Malaria Cell Classification

This notebook creates a model that classifies whether a blood smear is uninfected or parasitized, using TensorFlow and TensorFlow Hub.

## 1. Problem:
Classify whether a blood smear is uninfected or parasitized, using TensorFlow and TensorFlow Hub.

## 2. Data:
The data we're using is the malaria dataset, one of the image datasets available through the TensorFlow image libraries. It contains approximately 27,500 images of parasitized and uninfected cells, segmented from thin blood smear slide images.

https://data.lhncbc.nlm.nih.gov/public/Malaria/cell_images.zip

## 3. Evaluation:
The evaluation is a file with the prediction probabilities of each class (parasitized / uninfected) for each test cell image.

## 4. Features:
Some information about the data:
* The Malaria dataset contains a total of 27,558 cell images with equal instances of parasitized and uninfected cells from the thin blood smear slide images of segmented cells.


In [1]:
# Unzip zipped file uploaded to the Google Drive
!unzip '/content/drive/MyDrive/Malaria Cell Classification/cell_images.zip' -d '/content/drive/MyDrive/Malaria Cell Classification/'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/C236ThinF_IMG_20151127_102428_cell_118.png  
 extracting: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/C236ThinF_IMG_20151127_102428_cell_126.png  
 extracting: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/C236ThinF_IMG_20151127_102428_cell_134.png  
 extracting: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/C236ThinF_IMG_20151127_102428_cell_141.png  
 extracting: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/C236ThinF_IMG_20151127_102428_cell_168.png  
 extracting: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/C236ThinF_IMG_20151127_102428_cell_175.png  
 extracting: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/C236ThinF_IMG_20151127_102428_cell_183.png  
 extracting

## Getting Workspace ready:
* Import tensorflow ✅
* Import TensorFlow Hub ✅
* Make sure we're using a GPU ✅



In [None]:
# Import necessary tools
import tensorflow as tf
import tensorflow_hub as hub
print(f'TF version: {tf.__version__}')
print(f'TF Hub version: {hub.__version__}')

# Check for GPU availability
print('GPU', 'avilability (YESSSS!!!!!)' if tf.config.list_physical_devices('GPU') else 'not available :(')

TF version: 2.15.0
TF Hub version: 0.16.1
GPU avilability (YESSSS!!!!!)


In [2]:
import os

# Define folder paths (the two class folders produced by unzipping cell_images.zip)
folder_paths = [
    "/content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected",
    "/content/drive/MyDrive/Malaria Cell Classification/cell_images/Parasitized"
]

# Remove the Thumbs.db entry from each class folder.
# Why: later cells call os.listdir() on these folders and slice the listing
# before filtering by '.jpg'/'.png', so a non-image entry would take up a slot.
for folder_path in folder_paths:
    thumbs_db_path = os.path.join(folder_path, "Thumbs.db")
    # Check if the file exists before attempting to delete it, so re-running
    # this cell after a successful deletion does not raise.
    if os.path.exists(thumbs_db_path):
        os.remove(thumbs_db_path)
        print(f"Thumbs.db file deleted: {thumbs_db_path}")
    else:
        print(f"Thumbs.db file not found in {folder_path}.")

Thumbs.db file deleted: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected/Thumbs.db
Thumbs.db file deleted: /content/drive/MyDrive/Malaria Cell Classification/cell_images/Parasitized/Thumbs.db


## Combining images from Parasitized and Uninfected folders into images folder, shuffling the images and maintaining the labels of whether the cell is Parasitized or Uninfected.

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import random

# Function to read and resize images from a folder
def read_and_resize_images(folder_path, label, num_images, size=None):
    """
    Read up to `num_images` randomly chosen images from a folder and resize them.

    Args:
        folder_path: Directory whose entries are listed with os.listdir().
        label: Label attached to every image read from this folder.
        num_images: Maximum number of image files to select.
        size: Optional size tuple passed to cv2.resize(). When omitted, the
            module-level `target_size` is looked up at call time (it is
            defined after this function in the same cell).

    Returns:
        A tuple (images, image_ids, labels) of three parallel lists: the
        resized images, their filenames, and `label` repeated once per image.
        Files that cannot be read are reported with print() and skipped.
    """
    if size is None:
        size = target_size

    # Keep only image files BEFORE shuffling and slicing. Filtering after the
    # slice would let non-image entries (e.g. Thumbs.db) use up part of the
    # `num_images` quota and silently return fewer images than requested.
    filenames = [name for name in os.listdir(folder_path)
                 if name.endswith(('.jpg', '.png'))]  # You can add more formats if needed
    random.shuffle(filenames)  # Shuffle filenames randomly

    images = []
    image_ids = []
    labels = []
    for filename in filenames[:num_images]:
        image_path = os.path.join(folder_path, filename)
        try:
            img = cv2.imread(image_path)  # Read image using OpenCV
            if img is None:
                print("Failed to read image:", image_path)
                continue
            img_resized = cv2.resize(img, size)  # Resize image to target size
        except Exception as e:
            print("Error reading image:", image_path)
            print(e)
            continue
        images.append(img_resized)
        image_ids.append(filename)  # Store image id (filename)
        labels.append(label)  # Add label for each image
    return images, image_ids, labels

# Define the target size for resizing images.
# read_and_resize_images() reads this module-level name when it is called.
target_size = (128, 128) # Adjust as needed

# Define the paths to the folders containing images
parasitized_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/Parasitized'
uninfected_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected'

# Specify the number of images to use from each folder
num_images_per_folder = 10000

# Read and resize the specified number of images from both folders.
# NOTE(review): the selection uses random.shuffle() without a seed, so each run
# of this cell picks a different subset.
images_parasitized, image_ids_parasitized, labels_parasitized = read_and_resize_images(parasitized_path, label='parasitized', num_images=num_images_per_folder)
images_uninfected, image_ids_uninfected, labels_uninfected = read_and_resize_images(uninfected_path, label='uninfected', num_images=num_images_per_folder)

# Combine the images, image ids, and labels from both folders
# (parasitized first, then uninfected; the three lists stay index-aligned)
combined_images = images_parasitized + images_uninfected
combined_image_ids = image_ids_parasitized + image_ids_uninfected
combined_labels = labels_parasitized + labels_uninfected

# Create a dictionary to map image IDs to images
# NOTE(review): not read again in this cell — confirm it is needed elsewhere.
id_to_image = dict(zip(combined_image_ids, combined_images))

# Combine image ids, pixel arrays and labels into a DataFrame
df = pd.DataFrame({'id': combined_image_ids, 'image': combined_images, 'label': combined_labels})

# Shuffle the DataFrame (unseeded) so the two classes are interleaved
df_shuffled = df.sample(frac=1).reset_index(drop=True)

# Define the output folder path
output_folder = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/images'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Write the combined (resized) images to the output folder under their original
# filenames; the class of each file is only recorded in df / df_shuffled.
for i, image_data in enumerate(combined_images):
    # Generate the file path for the image
    image_id = combined_image_ids[i]
    image_label = combined_labels[i]  # NOTE(review): assigned but not used in this loop
    file_path = os.path.join(output_folder, f"{image_id}")

    # Save the image using OpenCV
    cv2.imwrite(file_path, image_data)

# Print success message
print(f"{num_images_per_folder} images from both folders have been successfully saved to folder: {output_folder}")

In [None]:
df_shuffled.head()

In [None]:
# Display image at certain index
import matplotlib.pyplot as plt

# Select the index of the image you want to display
image_index = 10  # Change this index to display a different image

# Select the image and label using the index
selected_image = df_shuffled['image'][image_index]
selected_label = df_shuffled['label'][image_index]

# Display the selected image.
# The pixel arrays in df_shuffled were loaded with cv2.imread, which stores
# channels in BGR order, while matplotlib interprets arrays as RGB. Convert for
# display only so red and blue are not swapped on screen.
plt.imshow(cv2.cvtColor(selected_image, cv2.COLOR_BGR2RGB))
plt.title(selected_label)
plt.axis('off')
plt.show()

In [None]:
# Create image paths from image IDs
image_folder_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/images'
image_paths = [os.path.join(image_folder_path, image_id) for image_id in df_shuffled['id'].tolist()]

# Check out the first 10 image paths
image_paths[:10]

In [None]:
# Create a new DataFrame with image labels instead of pixel values
image_labels_df = pd.DataFrame({'label': df_shuffled['label']})

# Select the top N most frequent image labels
top_n_labels = 10  # Change this value as needed
top_labels = image_labels_df['label'].value_counts().nlargest(top_n_labels)

# Generate the bar plot
top_labels.plot.bar(figsize=(1, 5))

# Add title and labels
plt.title('Top {} Most Frequent Image Labels'.format(top_n_labels))
plt.xlabel('Label')
plt.ylabel('Count')

# Show the plot
plt.show()

In [None]:
# Prepare labels
labels = df_shuffled['label'].to_numpy()
labels

In [None]:
len(labels)

In [None]:
unique_labels = np.unique(labels)
unique_labels

In [None]:
# Turn a single label into an array of booleans
print(labels[0])
labels[0] == unique_labels

In [None]:
# Turn every label into a boolean array
boolean_labels = [label == unique_labels for label in labels]
boolean_labels[:2]

In [None]:
len(boolean_labels)

In [None]:
# Example: Turning boolean array into integers
print(labels[140]) # Original label
print(np.where(unique_labels == labels[140])) # index where label occurs
print(boolean_labels[140].argmax()) # index where label occurs in the boolean array
print(boolean_labels[140].astype(int)) # there will be a 1 where the sample label occurs

In [None]:
print(labels[5]) # Original label
print(np.where(unique_labels == labels[5])) # index where label occurs
print(boolean_labels[5].argmax()) # index where label occurs in the boolean array
print(boolean_labels[5].astype(int)) # there will be a 1 where the sample label occurs

## Creating validation set

In [None]:
# Set up x & y variables
x = image_paths
y = boolean_labels

In [None]:
len(image_paths)

## Choosing the number of images to start with and increasing as needed

In [None]:
# Set number of images to use for experimenting
NUM_IMAGES = 1000 #@param {type:'slider', min:1000, max:10000, step:1000}

In [None]:
# Let's split our data into train and validation sets
from sklearn.model_selection import train_test_split

# Split them into training and validation of total size NUM_IMAGES
x_train, x_val, y_train, y_val = train_test_split(x[:NUM_IMAGES], y[:NUM_IMAGES], test_size=0.2, random_state=42)

len(x_train), len(x_val), len(y_train), len(y_val)

In [None]:
# Let's have a geez at the training data
x_train[:15], y_train[:5]

# Preprocessing images (Turning images into Tensors)
To preprocess our images into Tensors we're going to write a function which does a few things:
* Take an image filepath as input
* Use TensorFlow to read the file and save it to a variable, `image`
* Turn our image (a .png or .jpg file) into Tensors
* Resize the image to be a shape of (224, 224)
* Return the modified image

Before we do, lets see what importing an image looks like.

In [None]:
# Define image size (height and width, in pixels, that images are resized to)
IMG_SIZE = 224

# Create a function for preprocessing images
def process_image(image_path, img_size=IMG_SIZE):
  """
  Read the image file at `image_path` and return it as a float32 Tensor
  resized to (IMG_SIZE, IMG_SIZE) with 3 color channels.

  NOTE(review): `img_size` is accepted but not used; the resize always uses
  the module-level IMG_SIZE. Honouring the parameter would require first
  fixing callers that pass something other than a size as the second argument.
  NOTE(review): the dataset files are .png while decoding uses
  tf.image.decode_jpeg — confirm this is supported by the TensorFlow version
  in use, or switch to tf.image.decode_png.
  """
  # Raw file contents
  raw_bytes = tf.io.read_file(image_path)
  # Numerical Tensor with 3 color channels (Red, Green, Blue)
  decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
  # Color channel values converted from 0-255 to 0-1
  scaled = tf.image.convert_image_dtype(decoded, tf.float32)
  # Resized to our desired value (224, 224)
  return tf.image.resize(scaled, size=[IMG_SIZE, IMG_SIZE])

In [None]:
# Create a simple function to return a tuple (image, label)
def get_image_label(image_path, label):
  """
  Preprocess the image stored at `image_path` with process_image() and pair
  the result with `label`, which is passed through unchanged.

  Returns a tuple of (image, label).
  """
  return process_image(image_path), label

In [None]:
# Demo of the above: get_image_label takes (path, label). The earlier call to
# process_image(path, label) passed the label in the `img_size` position.
get_image_label(x[6], tf.constant(y[6]))

In [None]:
# Define the batch size, 32 is a good start
BATCH_SIZE = 32

# Create a function to turn data into batches
def create_data_batches(x, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  """
  Create batches of data out of image (x) and label (y) pairs.

  Args:
    x: Image file paths.
    y: Labels aligned with `x`; not needed when `test_data` is True.
    batch_size: Number of samples per batch (defaults to BATCH_SIZE).
    valid_data: If True, build an unshuffled (image, label) dataset.
    test_data: If True, build an unshuffled image-only dataset (no labels).
      Takes precedence over `valid_data`.

  Returns:
    A batched tf.data dataset. Training data (neither flag set) is shuffled
    before the images are loaded; validation and test data keep their order.
  """
  # If the data is a test dataset, we probably don't have labels
  if test_data:
    print('Creating test data batches...')
    data = tf.data.Dataset.from_tensor_slices((tf.constant(x))) # only filepaths (no labels)
    return data.map(process_image).batch(batch_size)

  # If the data is a valid dataset, we don't need to shuffle it
  if valid_data:
    print('Creating validation data batches...')
    data = tf.data.Dataset.from_tensor_slices((tf.constant(x), # filepaths
                                               tf.constant(y))) # labels
    return data.map(get_image_label).batch(batch_size)

  print('Creating training data batches...')
  data = tf.data.Dataset.from_tensor_slices((tf.constant(x), # filepaths
                                             tf.constant(y))) # labels
  # Shuffling pathnames and labels before mapping image processor function is faster than shuffling images
  data = data.shuffle(buffer_size=len(x))

  # Create (image, label) tuples (this also turns the image paths into preprocessed images)
  data = data.map(get_image_label)

  # Turn the training data into batches, honouring the caller's batch_size
  return data.batch(batch_size)

In [None]:
# Create training and validation data batches
train_data = create_data_batches(x_train, y_train)
val_data = create_data_batches(x_val, y_val, valid_data=True)

In [None]:
# Check out the different attributes of our data batches
train_data.element_spec, val_data.element_spec

## Visualize Data Batches
Our data is now in batches, however, these can be a little hard to understand or comprehend, let's visualize them!

In [None]:
# Create a function for viewing images in a data batch
def show_25_images(images, labels):
  """
  Displays a plot of up to 25 images and their labels from a data batch.

  `images` and `labels` are index-aligned; each label is a boolean/one-hot
  array whose argmax is looked up in `unique_labels` for the subplot title.
  Batches with fewer than 25 images (e.g. the last batch of a dataset) are
  plotted in full instead of raising IndexError.
  """
  # Setup the figure
  plt.figure(figsize=(10, 10))
  # Loop through at most 25 images (5 rows x 5 columns)
  for i in range(min(25, len(images))):
    # Create subplots (5 rows, 5 columns)
    plt.subplot(5, 5, i+1)
    # Displaying an image
    plt.imshow(images[i])
    # Add the image label as the title
    plt.title(unique_labels[labels[i].argmax()])
    # Turn the grid lines off
    plt.axis('off')

In [None]:
train_data

In [None]:
train_images, train_labels = next(train_data.as_numpy_iterator())
len(train_images), len(train_labels)

In [None]:
# Now let's visualize the data in a training batch
train_images, train_labels = next(train_data.as_numpy_iterator())
show_25_images(train_images, train_labels)

In [None]:
# Now let's visualize our validation set
val_images, val_labels = next(val_data.as_numpy_iterator())
show_25_images(val_images, val_labels)

## Building a model
Before we build a model, there are a few things we need to define:

* The input shape (our images shape, in the form of Tensors) to our model.
* The output shape (image labels, in the form of Tensors) of our model.
* The URL of the model we want to use.

In [None]:
# Setup input shape to the model
INPUT_SHAPE = [None, IMG_SIZE, IMG_SIZE, 3] # batch, height, width, color channels

# Setup output shape of our model
OUTPUT_SHAPE = len(unique_labels)

# Setup model URL from TensorFlow Hub
MODEL_URL = 'https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/4'

Now we've got our inputs, outputs and model ready to go. Let's put them together into a Keras deep learning model!

Knowing this, let's create a function which:

* Takes the input shape, output shape and the model we've chosen as parameters.
* Defines the layers in a Keras model in a sequential fashion (do this first, then this, then that).
* Compile the model (says it should be evaluated and improved).
* Builds the model (tells the model the input shape it'll be getting).
* Returns the model.

All of these steps can be found here: https://www.tensorflow.org/guide/keras

In [None]:
# Create a function which builds a Keras model
def create_model(input_shape=INPUT_SHAPE, output_shape=OUTPUT_SHAPE, model_url=MODEL_URL):
  """
  Build and compile a Keras Sequential model on top of a TensorFlow Hub layer.

  Args:
    input_shape: Shape passed to model.build() (defaults to INPUT_SHAPE).
    output_shape: Number of units in the final Dense softmax layer
      (defaults to OUTPUT_SHAPE, the number of unique labels).
    model_url: TensorFlow Hub URL loaded with hub.KerasLayer
      (defaults to MODEL_URL).

  Returns:
    The compiled and built model.
  """
  # Use the arguments rather than the module-level constants so that callers
  # can actually override the shapes and the hub model.
  print('Building model with: ', model_url)

  # Setup the model layers
  model = tf.keras.Sequential([
      hub.KerasLayer(model_url), # Layer 1 (input layer)
      tf.keras.layers.Dense(units=output_shape,
                            activation='softmax') # Layer 2 (Output layer)
  ])

  # Compile the model
  model.compile(
      loss=tf.keras.losses.CategoricalCrossentropy(),
      optimizer=tf.keras.optimizers.Adam(),
      metrics=['accuracy']
  )

  # Build the model
  model.build(input_shape)

  return model

In [None]:
model = create_model()
model.summary()

In [None]:
output = np.ones(shape=(1, 1, 1280))
output

## Creating callbacks
Callbacks are helper functions a model can use during training to do things such as save its progress, check its progress, or stop training early if the model stops improving.

We'll create two callbacks, one for TensorBoard which helps track our models progress and another for early stopping which prevents our model from training too long.

TensorBoard Callback
To setup a TensorBoard callback, we need to do 3 things:

1. Load the TensorBoard notebook extension ✅
2. Create a TensorBoard callback which is able to save logs to a directory and pass it to our model's fit() function.✅
3. Visualize our models training logs with the %tensorboard magic function (we'll do this after model training).✅

In [None]:
# Load TensorBoard notebook extension
%load_ext tensorboard

In [None]:
import datetime

# Create a function to build a Tensorboard callback
def create_tensorboard_callback():
  """
  Return a TensorBoard callback that logs to a fresh, timestamped directory
  under the project's `logs` folder.
  """
  # A per-call timestamp keeps the logs of each experiment in its own directory
  run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
  logdir = os.path.join('/content/drive/MyDrive/Malaria Cell Classification/cell_images/logs',
                        run_stamp)
  return tf.keras.callbacks.TensorBoard(logdir)

## Early stopping callback
Early stopping helps prevent our model from overfitting by stopping training if a certain evaluation metric stops improving.

https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping

In [None]:
# Create early stopping callbacks
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                  patience=3)

## Training a model (on subset of data)
Our first model is only going to train on 1000 images, to make sure everything is working.

In [None]:
NUM_EPOCHS = 100 #@param {type:'slider', min:10, max:100, step:10}

Let's create a function which trains a model.

* Create a model using create_model()
* Setup a TensorBoard callback using create_tensorboard_callback()
* Call the fit() function on our model passing it the training data, validation data, number of epochs to train for (NUM_EPOCHS) and the callbacks we'd like to use.
* Return the model

In [None]:
# Build a function to train and return a trained model
def train_model():
  """
  Create a new model with create_model(), fit it on `train_data` while
  validating on `val_data`, and return the fitted model.

  Reads the module-level `train_data`, `val_data`, `NUM_EPOCHS` and
  `early_stopping`.
  """
  fresh_model = create_model()

  # A new TensorBoard callback per call, so every training run gets its own logs
  training_callbacks = [create_tensorboard_callback(), early_stopping]

  fresh_model.fit(x=train_data,
                  epochs=NUM_EPOCHS,
                  validation_data=val_data,
                  validation_freq=1,
                  callbacks=training_callbacks)
  return fresh_model

In [None]:
# Fit the model to the data
model = train_model()

## Checking the TensorBoard logs
The TensorBoard magic function (`%tensorboard`) will access the logs directory we created earlier and visualize its contents.

In [None]:
%tensorboard --logdir drive/MyDrive/Dog\ vision/logs

## Making and evaluating predictions using a trained model

In [None]:
val_data

In [None]:
# Make predictions on the validation data (not used to train on)
predictions = model.predict(val_data, verbose=1)
predictions

In [None]:
len(predictions[0])

In [None]:
predictions[0]

In [None]:
np.sum(predictions[0])

In [None]:
predictions.shape

In [None]:
len(y_val)

In [None]:
len(unique_labels)

In [None]:
# First predictions
index = 90
print(predictions[index])
print(f'Max value (probability of prediction): {np.max(predictions[index])}')
print(f'Sum: {np.sum(predictions[index])}')
print(f'Max index: {np.argmax(predictions[index])}')
print(f'Predicted label: {unique_labels[np.argmax(predictions[index])]}')

In [None]:
unique_labels[1]

Having the above functionality is great but we want to be able to do it at scale.

And it could be even better if we could see the image the predictions is being made on!

In [None]:
# Turn prediction probabilities into their respective label (easier to understand)
def get_pred_label(prediction_probabilities):
  """
  Return the entry of `unique_labels` at the position of the highest value in
  `prediction_probabilities`.
  """
  best_index = np.argmax(prediction_probabilities)
  return unique_labels[best_index]

# Get a predicted label based on an array of prediction probabilities
pred_label = get_pred_label(predictions[3])
pred_label

In [None]:
val_data

Since our data is still in a batched dataset, we'll have to unbatch it to get the validation images and labels, and then compare the predictions to the validation labels (truth labels).

In [None]:
# Create a function to unbatch a batch dataset
def unbatchify(data):
  """
  Split a batched dataset of (image, label) pairs into two parallel lists.

  Returns (images, labels), where each label has already been converted from
  its array form to the matching entry of `unique_labels` via argmax.
  """
  samples = list(data.unbatch().as_numpy_iterator())
  images_ = [image for image, _ in samples]
  labels_ = [unique_labels[np.argmax(encoded)] for _, encoded in samples]
  return images_, labels_

# Unbatchify the validation data
val_images, val_labels = unbatchify(val_data)
val_images[0], val_labels[0]

In [None]:
# val_labels already holds label strings (unbatchify converts them), so it must
# not be fed to get_pred_label, which expects prediction probabilities.
# Show the predicted label for the first validation image next to its true label.
get_pred_label(predictions[0]), val_labels[0]

Let's make some functions to make all of this a bit more visual.

We'll create a function which:

* Takes an array of prediction probabilities, an array labels and an array of images and an integers ✅
* Convert the prediction probabilities to a predicted label✅
* Plot the predicted label, its predicted probability, the truth label and the target image on a single plot✅

In [None]:
def plot_pred(prediction_probabilities, labels, images, n=1):
  """
  Show image `n` with a title of the form
  "<predicted label> <top probability>% <true label>".

  The title is green when the predicted label equals the true label and red
  otherwise.
  """
  sample_probs, sample_truth, sample_image = prediction_probabilities[n], labels[n], images[n]
  predicted = get_pred_label(sample_probs)

  # Image only: no axis ticks
  plt.imshow(sample_image)
  plt.xticks([])
  plt.yticks([])

  title_color = 'green' if predicted == sample_truth else 'red'
  plt.title(f'{predicted} {np.max(sample_probs)*100:2.0f}% {sample_truth}',
            color=title_color)

In [None]:
plot_pred(prediction_probabilities=predictions,
          labels=val_labels,
          images=val_images,
          n=110)

Now we've got one function to visualize our models top prediction, let's make another to view our model top 10 predictions.

This function will:

* Take an input of prediction probabilities array and a ground truth array and an integer.
* find the prediction using get_pred_label()
* Find the top 10:
  * Prediction probabilities indexes
  * Prediction probabilities values
  * Prediction labels
* Plot the top 10 prediction probability values and labels, coloring the true label green.

In [None]:
def plot_pred_conf(prediction_probabilities, labels, n=1, top_k=10):
  """
  Plot the `top_k` highest prediction confidences along with the truth label
  for sample n.

  Args:
    prediction_probabilities: Array of per-sample prediction probabilities.
    labels: Truth labels aligned with `prediction_probabilities`.
    n: Index of the sample to plot.
    top_k: Maximum number of classes to show (default 10). When there are
      fewer classes than `top_k`, all of them are shown.

  The bars are grey, sorted from highest to lowest confidence; the bar of the
  true label is colored green when it is among those shown.
  """
  pred_prob, true_label = prediction_probabilities[n], labels[n]

  # Indexes of the top_k confidences, highest first
  top_pred_indexes = pred_prob.argsort()[-top_k:][::-1]
  # Matching confidence values
  top_pred_values = pred_prob[top_pred_indexes]
  # Matching label names
  top_pred_labels = unique_labels[top_pred_indexes]

  # Setup plot
  positions = np.arange(len(top_pred_labels))
  top_plot = plt.bar(positions,
                     top_pred_values,
                     color='grey')
  plt.xticks(positions,
             labels=top_pred_labels,
             rotation='vertical')

  # Change the color of the true label (only if its bar is on the plot)
  if np.isin(true_label, top_pred_labels):
    top_plot[np.argmax(top_pred_labels == true_label)].set_color('green')

In [None]:
plot_pred_conf(prediction_probabilities=predictions,
               labels=val_labels,
               n=110)

Now we've got some function to help us visualize our predictions and evaluate our model, let's check out a few.

In [None]:
# Let's check out a few predictions and their different values
i_multiplier = 20
num_rows = 5
num_cols = 4
num_images = num_rows*num_cols
plt.figure(figsize=(10*num_cols, 5*num_rows))
for i in range(num_images):
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_pred(prediction_probabilities=predictions,
            labels=val_labels,
            images=val_images,
            n=i+i_multiplier)
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_pred_conf(prediction_probabilities=predictions,
                 labels=val_labels,
                 n=i+i_multiplier)
plt.tight_layout(h_pad=1.0)
plt.show()

### Saving and reloading a trained model

In [None]:
# Create a function to save a model
def save_model(model, suffix=None):
  """
  Save `model` as an .h5 file in the project directory and return the path.

  The filename is '<YYYYmmdd-HHMMSS>-<suffix>.h5'; when `suffix` is None the
  '-<suffix>' part is omitted.

  Args:
    model: Object exposing a `save(path)` method (e.g. a Keras model).
    suffix: Optional string appended to the timestamp in the filename.

  Returns:
    The path the model was saved to.
  """
  # '%S' is zero-padded seconds. The lowercase '%s' used before is a
  # platform-specific code that produced epoch seconds in the filename
  # (e.g. '20240515-20191715804372-...').
  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
  modeldir = os.path.join('/content/drive/MyDrive/Malaria Cell Classification/cell_images',
                          timestamp)
  # The default suffix=None used to raise TypeError on string concatenation
  suffix_part = '' if suffix is None else '-' + suffix
  model_path = modeldir + suffix_part + '.h5' # Save format of model
  print(f'Saving model to: {model_path}...')
  model.save(model_path)
  return model_path


# Create a function to load a trained model
def load_model(model_path):
  """
  Load and return the Keras model stored at `model_path`.

  hub.KerasLayer is registered as a custom object so that models containing a
  TensorFlow Hub layer can be deserialized.
  """
  print(f'Loading saved model from: {model_path}')
  hub_objects = {'KerasLayer': hub.KerasLayer}
  return tf.keras.models.load_model(model_path, custom_objects=hub_objects)

In [None]:
# Save our model trained on 1000 images
save_model(model, suffix='1000-images-mobilenetv2-Adam')

In [None]:
# Load a trained model
loaded_1000_image_model = load_model('/content/drive/MyDrive/Malaria Cell Classification/cell_images/20240515-20191715804372-1000-images-mobilenetv2-Adam.h5')

In [None]:
# Evaluate the pre-saved model
model.evaluate(val_data)

In [None]:
# Evaluating the loaded model
loaded_1000_image_model.evaluate(val_data)

Training Cell Images (on the full data)

In [None]:
len(x), len(y)

In [None]:
len(x_train)

In [None]:
# Create a data batch with a full data set
full_data = create_data_batches(x, y)
full_data

In [None]:
# Create a model for full model
full_model = create_model()

In [None]:
# Create full model callbacks
full_model_tensorboard = create_tensorboard_callback()
# No validation set when training on all the data, so we can't monitor validation accuracy
full_model_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy',
                                                             patience=3)

In [None]:
# Fit the full model to the full data
full_model.fit(x=full_data,
               epochs=NUM_EPOCHS,
               callbacks=[full_model_tensorboard, full_model_early_stopping])

In [None]:
save_model(full_model, suffix='full-image-set-mobilenetv2-Adam')

In [None]:
# Load in the full model.
# The path used before pointed at the '1000-images' checkpoint, so the "full"
# model was never actually loaded. save_model() timestamps its filenames, so
# pick the most recently written checkpoint with the full-data suffix.
import glob

full_model_paths = glob.glob('/content/drive/MyDrive/Malaria Cell Classification/cell_images/*-full-image-set-mobilenetv2-Adam.h5')
loaded_full_model = load_model(max(full_model_paths, key=os.path.getmtime))

## Making predictions on the test dataset
Take the cell images from the 3000th-from-last up to the 11th-from-last (2990 images, in sorted filename order) in each class folder and save them to the `tests` folder.

In [None]:
# Function to read and resize last 2990 images from a folder
def read_and_resize_last_2990_images(folder_path, label):
    """
    Read and resize the tail of a folder's alphabetically sorted listing,
    leaving out the final 10 entries.

    The selection is the last 3000 entries minus the last 10 (2990 entries
    when the folder holds at least 3000); only '.jpg' / '.png' entries in that
    selection are read. Resizing uses the module-level `target_size`.

    Returns:
        (images, image_ids, labels): three parallel lists holding the resized
        images, their filenames, and `label` once per image. Unreadable files
        are reported with print() and skipped.
    """
    ordered_names = sorted(os.listdir(folder_path))
    # Last 3000 entries, then drop the final 10 of those
    selected_filenames = ordered_names[-3000:][:-10]

    images, image_ids, labels = [], [], []
    for filename in selected_filenames:
        if not filename.endswith(('.jpg', '.png')):  # Check file format
            continue
        image_path = os.path.join(folder_path, filename)
        try:
            img = cv2.imread(image_path)  # Read image using OpenCV
            if img is None:
                print("Failed to read image:", image_path)
                continue
            images.append(cv2.resize(img, target_size))  # Resize image to target size
            image_ids.append(filename)  # Store image id (filename)
            labels.append(label)  # Add label for each image
        except Exception as e:
            print("Error reading image:", image_path)
            print(e)
    return images, image_ids, labels

# Define the target size for resizing images
# (read by read_and_resize_last_2990_images() at call time)
target_size = (128, 128) # Adjust as needed

# Define the paths to the folders containing images
parasitized_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/Parasitized'
uninfected_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected'

# Read and resize the last 2990 images from both folders.
# NOTE(review): the training images were a random sample drawn from these same
# two folders, so some of these "test" images may also have been trained on —
# confirm whether the overlap is acceptable.
images_parasitized, image_ids_parasitized, labels_parasitized = read_and_resize_last_2990_images(parasitized_path, label='parasitized')
images_uninfected, image_ids_uninfected, labels_uninfected = read_and_resize_last_2990_images(uninfected_path, label='uninfected')

# Combine the images, image ids, and labels from both folders
# (parasitized first, then uninfected; the three lists stay index-aligned)
combined_images = images_parasitized + images_uninfected
combined_image_ids = image_ids_parasitized + image_ids_uninfected
combined_labels = labels_parasitized + labels_uninfected

# Create a dictionary to map image IDs to images
# NOTE(review): not read again in this cell — confirm it is needed elsewhere.
id_to_image = dict(zip(combined_image_ids, combined_images))

# Combine image ids, pixel arrays and labels into a DataFrame.
# NOTE: this rebinds `df` and `df_shuffled`, which earlier cells built from the
# training images; re-running earlier cells after this one will see test data.
df = pd.DataFrame({'id': combined_image_ids, 'image': combined_images, 'label': combined_labels})

# Shuffle the DataFrame (unseeded)
df_shuffled = df.sample(frac=1).reset_index(drop=True)

# Define the output folder path
output_folder = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/tests'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Write the combined (resized) images to the output folder under their original
# filenames; the class of each file is only recorded in df / df_shuffled.
for i, image_data in enumerate(combined_images):
    # Generate the file path for the image
    image_id = combined_image_ids[i]
    image_label = combined_labels[i]  # NOTE(review): assigned but not used in this loop
    file_path = os.path.join(output_folder, f"{image_id}")

    # Save the image using OpenCV
    cv2.imwrite(file_path, image_data)

# Print success message
print(f"2990 images from both folders have been successfully saved to folder: {output_folder}")

In [None]:
# Load test image filenames
test_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/tests'
# test_path has no trailing separator, so plain string concatenation would glue
# the folder name and the filename together ('.../tests<filename>') and yield
# paths that do not exist. os.path.join inserts the separator.
test_filenames = [os.path.join(test_path, fname) for fname in os.listdir(test_path)]
test_filenames[:10]

In [None]:
len(test_filenames)

In [None]:
# Create test data batch
test_data = create_data_batches(test_filenames, test_data=True)

In [None]:
test_data

In [None]:
# Make predictions on test data batch using the loaded full model
test_predictions = loaded_full_model.predict(test_data,
                                             verbose=1)

In [None]:
# Save predictions (Numpy array) to csv file (for access later)
np.savetxt('/content/drive/MyDrive/Malaria Cell Classification/cell_images/preds_array.csv', test_predictions, delimiter=',')

In [None]:
# Load predictions (Numpy array) from csv file
test_predictions = np.loadtxt('/content/drive/MyDrive/Malaria Cell Classification/cell_images/preds_array.csv', delimiter=',')

In [None]:
test_predictions[:10]

In [None]:
test_predictions.shape

## Making predictions on custom images
Save the last 10 Cell Images from each folder and save them to my_cell_images_photos folder

In [None]:
# Function to read and resize the last 10 images from a folder
def read_and_resize_last_10_images(folder_path, label, num_images=10):
    """
    Read and resize the last `num_images` entries of a folder's alphabetically
    sorted listing.

    The default of 10 matches the function name and the test-set selection,
    which leaves out exactly the last 10 sorted entries of each folder. The
    previous slice of the last 20 overlapped that test set by 10 images per
    folder.

    Args:
        folder_path: Directory whose entries are listed with os.listdir().
        label: Label attached to every image read from this folder.
        num_images: How many trailing entries of the sorted listing to take.

    Returns:
        (images, image_ids, labels): three parallel lists holding the resized
        images, their filenames, and `label` once per image. Only '.jpg' /
        '.png' entries are read; unreadable files are reported with print()
        and skipped. Resizing uses the module-level `target_size`.
    """
    images = []
    image_ids = []
    labels = []  # Initialize labels list
    filenames = sorted(os.listdir(folder_path))  # Sort filenames alphabetically
    # Slice the raw sorted listing (before the extension filter) so the
    # selection is the exact complement of the test-set slice. The guard avoids
    # filenames[-0:], which would select everything.
    last_10_filenames = filenames[-num_images:] if num_images > 0 else []
    for filename in last_10_filenames:
        if filename.endswith('.jpg') or filename.endswith('.png'):  # Check file format
            image_path = os.path.join(folder_path, filename)
            try:
                img = cv2.imread(image_path)  # Read image using OpenCV
                if img is None:
                    print("Failed to read image:", image_path)
                    continue
                img_resized = cv2.resize(img, target_size)  # Resize image to target size
                images.append(img_resized)
                image_ids.append(filename)  # Store image id (filename)
                labels.append(label)  # Add label for each image
            except Exception as e:
                print("Error reading image:", image_path)
                print(e)
    return images, image_ids, labels  # Return images, image ids, and labels

# Define the target size for resizing images
target_size = (128, 128) # Adjust as needed

# Define the paths to the folders containing images
parasitized_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/Parasitized'
uninfected_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/Uninfected'

# Nominal number of images per folder.
# NOTE(review): this value is not passed to read_and_resize_last_10_images, which
# decides on its own how many files it selects, so it must not be used to report
# how many images were actually written.
num_images_per_folder = 10

# Read and resize the trailing images from both folders
images_parasitized, image_ids_parasitized, labels_parasitized = read_and_resize_last_10_images(parasitized_path, label='parasitized')
images_uninfected, image_ids_uninfected, labels_uninfected = read_and_resize_last_10_images(uninfected_path, label='uninfected')

# Combine the images, image ids, and labels from both folders
combined_images = images_parasitized + images_uninfected
combined_image_ids = image_ids_parasitized + image_ids_uninfected
combined_labels = labels_parasitized + labels_uninfected

# Create a dictionary to map image IDs to images
id_to_image = dict(zip(combined_image_ids, combined_images))

# Combine image ids and labels into a DataFrame
df = pd.DataFrame({'id': combined_image_ids, 'image': combined_images, 'label': combined_labels})

# Shuffle the DataFrame
df_shuffled = df.sample(frac=1).reset_index(drop=True)

# Define the output folder path
output_folder = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/my_cell_images'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Write the combined images to the output folder, counting real successes
saved_count = 0
for image_id, image_data in zip(combined_image_ids, combined_images):
    # The image id is the original filename, reused as the output filename
    file_path = os.path.join(output_folder, image_id)

    # cv2.imwrite reports a failed write by returning False rather than raising
    if cv2.imwrite(file_path, image_data):
        saved_count += 1
    else:
        print("Failed to save image:", file_path)

# Print success message with the number of images actually written
print(f"{saved_count} images from both folders have been successfully saved to folder: {output_folder}")

In [None]:
# Get custom image filepaths
custom_path = '/content/drive/MyDrive/Malaria Cell Classification/cell_images/my_cell_images'
# Build each path with os.path.join: custom_path has no trailing separator, so
# plain string concatenation would fuse the folder name and the file name into
# a path that does not exist.
custom_image_paths = [os.path.join(custom_path, fname) for fname in os.listdir(custom_path)]

In [None]:
# Display every collected custom image path
custom_image_paths

In [None]:
# Turn custom images into batch datasets
# create_data_batches is defined elsewhere in the notebook.
# NOTE(review): test_data=True presumably builds batches without labels — confirm against its definition.
custom_data = create_data_batches(custom_image_paths, test_data=True)
# Display the object returned by create_data_batches
custom_data

In [None]:
# Make predictions on the custom data
custom_preds = loaded_full_model.predict(custom_data)
# Show the shape of the returned predictions
custom_preds.shape

In [None]:
# Turn every entry of custom_preds into its prediction label
custom_pred_labels = [get_pred_label(single_pred) for single_pred in custom_preds]
custom_pred_labels

In [None]:
# Get custom images (Our unbatchify() function won't work since there aren't labels... maybe we could fix this later)
# Collect every element of the unbatched dataset into a plain list
custom_images = list(custom_data.unbatch().as_numpy_iterator())

In [None]:
# Check custom image predictions
# Size the subplot grid from the number of images: a fixed 1x4 grid makes
# plt.subplot raise ValueError as soon as there are more than 4 images.
n_cols = 4
n_rows = max(1, -(-len(custom_images) // n_cols))  # ceiling division, at least one row
# Keep the original 10x10 canvas for small grids and grow it for taller ones
plt.figure(figsize=(10, max(10, 2.5 * n_rows)))
for i, image in enumerate(custom_images):
  plt.subplot(n_rows, n_cols, i+1)
  # Hide the axis ticks; only the image and its predicted label matter here
  plt.xticks([])
  plt.yticks([])
  plt.title(custom_pred_labels[i])
  plt.imshow(image)