# Training and Loading the Pretrained Model on a Fresh Dataset

In [None]:
import pandas as pd
import numpy as np
import os
from PIL import Image
from sklearn.model_selection import train_test_split
from matplotlib.pyplot import imread
from keras.layers import Input
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Report the installed TensorFlow and TensorFlow Hub versions for reproducibility
print("TF version:", tf.__version__)
print("Hub version:", hub.__version__)

# Check for GPU: list_physical_devices returns an empty (falsy) list when no GPU is visible
print("GPU", "available" if tf.config.list_physical_devices("GPU") else "not available")

TF version: 2.8.2
Hub version: 0.12.0
GPU available


In [None]:
# Load the training labels table; later cells read its "id" and "label" columns
train_labels_csv = pd.read_csv("drive/MyDrive/Datathon/train_labels.csv")

In [None]:
# Labels column of the training CSV as a NumPy array
labels = train_labels_csv["label"].to_numpy()
# Distinct label values (np.unique returns them sorted)
unique_labels = np.unique(labels)
# One boolean mask per sample: True only at the position of that sample's label
boolean_labels = [unique_labels == sample_label for sample_label in labels]


In [None]:
# Build the full image path for every ID listed in train_labels_csv
train_path = "drive/MyDrive/Datathon/train/train/"
filenames = [f"{train_path}{image_id}.jpeg" for image_id in train_labels_csv["id"]]

# Build the full path for every entry found in the validation directory
val_path = "drive/MyDrive/Datathon/validation/validation/"
val_filenames = [f"{val_path}{entry}" for entry in os.listdir(val_path)]

In [None]:
# Setup X & y variables
# X: image file paths (model inputs)
X = filenames
# y: per-sample boolean label masks (targets), aligned with X by position
y = boolean_labels

In [None]:
# Edge length (in pixels) of the square images fed to the model
IMG_SIZE = 224

def process_image(image_path):
  """
  Loads the JPEG stored at `image_path` and returns it as a float32 image
  Tensor with 3 colour channels, resized to (IMG_SIZE, IMG_SIZE).
  """
  raw_bytes = tf.io.read_file(image_path)
  # Decode the JPEG bytes into a 3-channel (Red, Green, Blue) integer Tensor
  rgb_pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
  # Rescale the 0-255 integer pixel values to floats in the 0-1 range
  scaled_pixels = tf.image.convert_image_dtype(rgb_pixels, tf.float32)
  # Resize to the square model input size (224, 224)
  return tf.image.resize(scaled_pixels, size=[IMG_SIZE, IMG_SIZE])

In [None]:
# Pair a preprocessed image with its label
def get_image_label(image_path, label):
  """
  Runs `process_image` on the given file path and returns the resulting
  image together with the unchanged label as an (image, label) tuple.
  """
  return process_image(image_path), label

In [None]:
# Define the batch size, 32 is a good default
BATCH_SIZE = 32

# Create a function to turn data into batches
def create_data_batches(x, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  """
  Creates batches of data out of image (x) and label (y) pairs.
  Shuffles the data if it's training data but doesn't shuffle it if it's validation data.
  Also accepts test data as input (no labels).

  Args:
    x: sequence of image file paths.
    y: sequence of labels aligned with `x`; not needed when `test_data=True`.
    batch_size: number of samples per batch (defaults to BATCH_SIZE).
    valid_data: if True, build an unshuffled dataset of (image, label) pairs.
    test_data: if True, build an unshuffled dataset of images only.

  Returns:
    A batched `tf.data.Dataset` of images (test data) or (image, label) pairs.

  Raises:
    ValueError: if labels are required (training / validation) but `y` is None.
  """
  # If the data is a test dataset, we probably don't have labels
  if test_data:
    print("Creating test data batches...")
    data = tf.data.Dataset.from_tensor_slices(tf.constant(x)) # only filepaths
    return data.map(process_image).batch(batch_size)

  if y is None:
    raise ValueError("Labels (y) are required unless test_data=True.")

  # If the data is a validation dataset, we don't need to shuffle it
  if valid_data:
    print("Creating validation data batches...")
    data = tf.data.Dataset.from_tensor_slices((tf.constant(x), # filepaths
                                               tf.constant(y))) # labels
    return data.map(get_image_label).batch(batch_size)

  # If the data is a training dataset, we shuffle it
  print("Creating training data batches...")
  # Turn filepaths and labels into Tensors
  data = tf.data.Dataset.from_tensor_slices((tf.constant(x), # filepaths
                                             tf.constant(y))) # labels

  # Shuffle the (path, label) pairs before decoding: shuffling short strings is
  # far cheaper than shuffling decoded images. A buffer covering the whole
  # dataset gives a uniform shuffle, redrawn on every epoch.
  data = data.shuffle(buffer_size=max(len(x), 1))

  # Create (image, label) tuples (this also turns the image path into a preprocessed image)
  data = data.map(get_image_label)

  # Turn the data into batches
  return data.batch(batch_size)

In [None]:
# Turn the full training data (every file path in X with its label in y) into data batches
full_data = create_data_batches(X, y)

Creating training data batches...


In [None]:
# Setup input shape to the model
INPUT_SHAPE = [None, IMG_SIZE, IMG_SIZE, 3] # batch, height, width, colour channels

# Setup output shape of the model
OUTPUT_SHAPE = len(unique_labels) # number of unique labels

# Setup model URL from TensorFlow Hub
# NOTE(review): the recorded output of the model-creation cell names a different hub URL
# (tf2-preview/mobilenet_v2/classification/4), so that output looks stale -- re-run to refresh it.
MODEL_URL = "https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/5"

In [None]:
# we will build the model using the Keras API

def create_model(input_shape=INPUT_SHAPE, output_shape=OUTPUT_SHAPE, model_url=MODEL_URL):
  """
  Builds and compiles a two-layer Keras classifier on top of a TensorFlow Hub model.

  Args:
    input_shape: shape passed to `model.build` (batch, height, width, colour channels).
    output_shape: number of units in the softmax output layer (one per class).
    model_url: TensorFlow Hub URL of the pretrained model used as the first layer.

  Returns:
    The compiled and built `tf.keras.Sequential` model.
  """
  # Use the arguments rather than the module-level constants so that callers
  # can actually override the shape, class count and hub model.
  print("Building the model with:", model_url)

  # Setup the model layers
  model = tf.keras.Sequential([
    hub.KerasLayer(model_url), # Layer 1 (input layer)
    tf.keras.layers.Dense(units=output_shape,
                          activation="softmax") # Layer 2 (output layer). Softmax will predict the probabilities for each class for each image
  ])

  # Compile the model
  model.compile(
      loss=tf.keras.losses.CategoricalCrossentropy(), # Our model wants to reduce this (how wrong its guesses are)
      optimizer=tf.keras.optimizers.Adam(), # An optimizer helping our model how to improve its guesses
      metrics=["accuracy"] # We'd like this to go up
  )

  # Build the model
  model.build(input_shape) # Let the model know what kind of inputs it'll be getting

  return model

## Creating Model 2 for Full-Data Training

In [None]:
# Instantiate a new model for training on the full dataset
full_model2 = create_model()
# Print the layer-by-layer overview (output shapes and parameter counts)
full_model2.summary()

Building the model with: https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification/4
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_2 (KerasLayer)  (None, 1001)              3540265   
                                                                 
 dense_2 (Dense)             (None, 16)                16032     
                                                                 
Total params: 3,556,297
Trainable params: 16,032
Non-trainable params: 3,540,265
_________________________________________________________________


In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# NOTE(review): datetime is imported here rather than in the import cell at the top;
# the TensorBoard-callback and model-saving helpers below rely on this cell having run.
import datetime

# Build a TensorBoard callback that logs to a fresh, timestamped directory
def create_tensorboard_callback():
  """
  Returns a `tf.keras.callbacks.TensorBoard` callback that writes to a
  sub-directory of the logs folder named after the current date and time.
  """
  # A new timestamp on every call keeps the logs of each experiment apart
  run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  logdir = os.path.join("drive/MyDrive/Datathon/logs", run_stamp)
  return tf.keras.callbacks.TensorBoard(logdir)

In [None]:
# Create full model callbacks

# TensorBoard callback
full_model_tensorboard = create_tensorboard_callback()

# Early stopping callback
# Note: No validation set when training on all the data, so we monitor only training accuracy
# patience=2: training stops once "accuracy" has failed to improve for 2 consecutive epochs
full_model_early_stopping = tf.keras.callbacks.EarlyStopping(monitor="accuracy",
                                                             patience=2)

In [None]:
def save_model(model, suffix=None):
  """
  Saves a given model in the models directory as an HDF5 (.h5) file named
  after the current date and time, optionally followed by a suffix.

  Args:
    model: object exposing `save(path)` (e.g. a Keras model).
    suffix: optional string appended to the timestamp for clarity and reuse;
      when None the file is named by the timestamp alone.

  Returns:
    The path the model was saved to.
  """
  models_root = "drive/MyDrive/Datathon/models"
  # "%S" is seconds (00-59). The lowercase "%s" used before is a non-portable
  # platform extension (epoch seconds on glibc), which produced odd file names.
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  # Only append the suffix when one was given (None + str would raise TypeError)
  stem = timestamp if suffix is None else f"{timestamp}-{suffix}"
  model_path = os.path.join(models_root, stem + ".h5") # save format of model
  # Make sure the target directory exists before writing into it
  os.makedirs(models_root, exist_ok=True)
  print(f"Saving model to: {model_path}...")
  model.save(model_path)
  return model_path

In [None]:
def load_model(model_path):
  """
  Restores a previously saved model from `model_path` and returns it.
  """
  print(f"Loading saved model from: {model_path}")
  # Register hub.KerasLayer so the TensorFlow Hub layer stored in the file can be rebuilt
  hub_layers = {"KerasLayer": hub.KerasLayer}
  return tf.keras.models.load_model(model_path, custom_objects=hub_layers)

In [None]:
# Maximum number of passes over the training data. Defined here because no other
# cell sets it (a fresh kernel would raise NameError); 60 matches the
# "Epoch N/60" run recorded in this cell's output.
NUM_EPOCHS = 60

# Fit the full model to the full training data
full_model2.fit(x=full_data,
               epochs=NUM_EPOCHS,
               callbacks=[full_model_tensorboard,
                          full_model_early_stopping])

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x7fea1622a990>

In [None]:
# Save the model that was just fitted on full_data (the whole training set;
# reportedly 4000 images -- TODO confirm the count)
save_model(full_model2, suffix="full-model-2-Adam")

Saving model to: drive/MyDrive/Datathon/models/20221008-14551665240924-full-model-2-Adam.h5...


'drive/MyDrive/Datathon/models/20221008-14551665240924-full-model-2-Adam.h5'

In [None]:
# Reload a saved model from its .h5 file
# NOTE(review): the path is hard-coded to one particular saved file; the earlier
# "trained on 1000 images" remark could not be confirmed from this notebook -- verify.
loaded_model = load_model('drive/MyDrive/Datathon/models/20221008-14551665240924-full-model-2-Adam.h5')

Loading saved model from: drive/MyDrive/Datathon/models/20221008-14551665240924-full-model-2-Adam.h5


In [None]:
# Build an evaluation set from the first 500 training samples
# NOTE(review): X and y are the same lists full_data was built from, so these
# samples were seen during training -- this is not a held-out validation set.
X_val = X[:500]
y_val = y[:500]
val_data = create_data_batches(X_val, y_val, valid_data=True)

Creating validation data batches...


In [None]:
# Evaluate the loaded model on val_data; the result lists the loss followed by
# the compiled metric (accuracy)
loaded_model.evaluate(val_data)



[0.9254278540611267, 0.722000002861023]

In [None]:
# Map a vector of prediction probabilities to its label (Document Type)
def get_pred_label(prediction_probabilities):
  """
  Returns the entry of `unique_labels` found at the index of the highest
  prediction probability.
  """
  top_index = np.argmax(prediction_probabilities)
  return unique_labels[top_index]


In [None]:
# Saved model to load and directory of images to predict on (both passed to test() below)
# NOTE: data_path has no trailing separator, so file names must be joined to it
# with a separator (e.g. via os.path.join), not by plain string concatenation.
model_path = "drive/MyDrive/Datathon/models/20221007-08281665131319-full-trained-adam.h5" 
data_path = "drive/MyDrive/Datathon/validation/validation"

In [None]:
# Function to test the validation data stored in 'data_path' with the model stored in 'model_path'
# here, model_path = "drive/MyDrive/Datathon/models/20221007-08281665131319-full-trained-adam.h5"
#       data_path = "drive/MyDrive/Datathon/validation/validation"

def test(model_path, data_path, output_csv=r'drive/MyDrive/Datathon/predicted_label.csv'):
  """
  Predicts a label for every image in `data_path` and writes the result to a CSV.

  Args:
    model_path: path of the saved model to load.
    data_path: directory holding the images; file names are expected to look
      like "<integer id>.<extension>". A trailing separator is optional.
    output_csv: destination of the CSV with columns "id" and "label"
      (defaults to predicted_label.csv in the Datathon folder).

  Raises:
    ValueError: if a file name does not start with an integer id.
  """
  # Load the fully trained model
  loaded_full_model = load_model(model_path)

  # List the directory once, sorted, so that ids and file paths are derived from
  # the same sequence and the row order of the CSV is deterministic.
  image_names = sorted(os.listdir(data_path))

  # os.path.join inserts the separator that plain concatenation dropped when
  # data_path has no trailing slash.
  val_filenames = [os.path.join(data_path, name) for name in image_names]

  # The numeric id is the part of the file name before the first dot
  val_ids = [int(name.split(".")[0]) for name in image_names]

  # Create validation data batch so as to turn it into tensors and then fit it in our model
  val_data = create_data_batches(val_filenames, test_data=True)

  # Make predictions on the validation data (one row of probabilities per image)
  predictions = loaded_full_model.predict(val_data, verbose=1)

  # Turn each row of probabilities into its predicted label
  val_pred_labels = [get_pred_label(probabilities) for probabilities in predictions]

  # Fitting the data into Pandas dataframe
  df = pd.DataFrame({'id': val_ids, 'label': val_pred_labels})

  # Saving the predicted labels on validation set images in CSV
  df.to_csv(output_csv, index=False)

In [None]:
# Run inference on the images in data_path and write the predictions CSV
test(model_path, data_path)

In [None]:
# Read the predictions CSV back in and display it as a sanity check
data= pd.read_csv('drive/MyDrive/Datathon/predicted_label.csv')
data

Unnamed: 0,id,label
0,17801,4
1,17802,6
2,17803,4
3,17804,8
4,17805,3
...,...,...
895,18696,1
896,18697,12
897,18698,8
898,18699,14
