# For Loading and Testing the Pretrained Model on a Fresh Dataset

In [None]:
# '-d' sets the destination directory for the extracted files.
# '-n' never overwrites a file that already exists, so re-running this cell
# does not stop at unzip's interactive "replace?" prompt.
!unzip -n "drive/MyDrive/Datathon/datathonindoml-2022.zip" -d "drive/MyDrive/Datathon/"

In [None]:
import pandas as pd
import numpy as np
import os
from PIL import Image
from sklearn.model_selection import train_test_split
from matplotlib.pyplot import imread
from keras.layers import Input
import tensorflow as tf
import tensorflow_hub as hub

## Accessing the Data

In [None]:
# Training labels, one row per image
train_labels_csv = pd.read_csv("drive/MyDrive/Datathon/train_labels.csv")

# The "label" column as a NumPy array
labels = train_labels_csv["label"].to_numpy()
# Distinct label values, as returned by np.unique
unique_labels = np.unique(labels)
# One boolean mask per training row: True at the position of that row's label in unique_labels
boolean_labels = [unique_labels == current_label for current_label in labels]


In [None]:
# Training image paths, built from the "id" column of train_labels_csv
train_path = "drive/MyDrive/Datathon/train/train/"
filenames = [f"{train_path}{image_id}.tif" for image_id in train_labels_csv["id"]]

# Validation image paths, built from whatever files are in the validation folder
val_path = "drive/MyDrive/Datathon/validation/validation/"
val_filenames = [f"{val_path}{entry}" for entry in os.listdir(val_path)]

## Preprocessing the Images (Turning images into Tensors)

NOTE: Running the next cell may take a significant amount of time

In [None]:
# Rename the training and validation files from (.tif) to (.jpeg).
# Only the file name changes here; the image data itself is re-encoded as JPEG
# by the conversion cells further down.
#
# A path whose source is gone but whose '.jpeg' target already exists was renamed
# by an earlier run and is skipped, so this cell can be re-run. A path with
# neither file present still raises FileNotFoundError from os.rename.

for path_list in (filenames, val_filenames):
  for source_path in path_list:
    target_path = os.path.splitext(source_path)[0] + '.jpeg'
    if source_path == target_path:
      continue  # already carries the .jpeg extension
    if not os.path.exists(source_path) and os.path.exists(target_path):
      continue  # renamed by a previous run
    os.rename(source_path, target_path)

NOTE: Running the next cell may take a significant amount of time

In [None]:
# Create pathnames for every file currently in the training folder
train_path = "drive/MyDrive/Datathon/train/train/"
filenames = [train_path + str(fname) for fname in os.listdir(train_path)]     # Listed from the folder on disk (not from train_labels_csv)

# Re-encode every image in place as an RGB JPEG so it can be turned into a tensor later on.
# NOTE: every run of this cell compresses the files again (quality=90).

for infile in filenames:
    # The context manager closes the source file handle as soon as the pixels
    # have been converted, before the same path is overwritten below.
    with Image.open(infile) as im:
        out = im.convert("RGB")
    out.save(infile, "JPEG", quality=90)

NOTE: Running the next cell may take a significant amount of time

In [None]:
# Create pathnames for every file currently in the validation folder
val_path = "drive/MyDrive/Datathon/validation/validation/"
val_filenames = [val_path + str(fname) for fname in os.listdir(val_path)]

# Re-encode every image in place as an RGB JPEG so it can be turned into a tensor later on.
# NOTE: every run of this cell compresses the files again (quality=90).
for infile in val_filenames:
    # The context manager closes the source file handle as soon as the pixels
    # have been converted, before the same path is overwritten below.
    with Image.open(infile) as im:
        out = im.convert("RGB")
    out.save(infile, "JPEG", quality=90)

## Testing the Pre-saved Model on the Converted Dataset

In [None]:
# Side length (in pixels) of the square images produced by process_image
IMG_SIZE = 224

def process_image(image_path):
  """
  Loads the image stored at `image_path` and returns it as a float32 Tensor.

  The file is decoded as a JPEG with 3 colour channels (Red, Green, Blue),
  converted to float32 and resized to IMG_SIZE x IMG_SIZE (224 x 224).
  """
  raw_bytes = tf.io.read_file(image_path)
  rgb_pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
  # convert_image_dtype rescales the 0-255 colour values into the 0-1 range
  float_pixels = tf.image.convert_image_dtype(rgb_pixels, tf.float32)
  return tf.image.resize(float_pixels, size=[IMG_SIZE, IMG_SIZE])

In [None]:
# Pair a preprocessed image with its label
def get_image_label(image_path, label):
  """
  Runs process_image on `image_path` and returns the tuple (image, label).
  The label is passed through unchanged.
  """
  return process_image(image_path), label

In [None]:
# Map prediction probabilities to a label (Document Type)
def get_pred_label(prediction_probabilities):
  """
  Returns the entry of unique_labels located at the index of the highest
  value in `prediction_probabilities`.
  """
  best_index = np.argmax(prediction_probabilities)
  return unique_labels[best_index]

In [None]:
# Define the batch size, 32 is a good default
BATCH_SIZE = 32

# Create a function to turn data into batches
def create_data_batches(x, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  """
  Creates batches of data out of image (x) and label (y) pairs.

  Args:
    x: image file paths.
    y: labels matching `x`; required unless `test_data` is True.
    batch_size: number of samples per batch (defaults to BATCH_SIZE).
    valid_data: if True, announce and build a validation dataset.
    test_data: if True, build a dataset of images only (no labels).
      Checked first, so it takes precedence over `valid_data`.

  Returns:
    A batched tf.data.Dataset yielding images (test data) or
    (image, label) pairs (validation / training data).

  Raises:
    ValueError: if labels are needed but `y` is None.

  NOTE: no branch shuffles the data. Add an explicit shuffle step if this
  function is used to build batches for training.
  """
  # If the data is a test dataset, we probably don't have labels
  if test_data:
    print("Creating test data batches...")
    data = tf.data.Dataset.from_tensor_slices(tf.constant(x)) # only filepaths
    return data.map(process_image).batch(batch_size)

  # Fail with a clear message instead of an opaque error from tf.constant(None)
  if y is None:
    raise ValueError("Labels (y) are required unless test_data=True")

  # Validation and training data are built the same way; only the message differs
  if valid_data:
    print("Creating validation data batches...")
  else:
    print("Creating training data batches...")

  # Turn filepaths and labels into Tensors
  data = tf.data.Dataset.from_tensor_slices((tf.constant(x), # filepaths
                                             tf.constant(y))) # labels

  # Create (image, label) tuples (this also turns the image path into a preprocessed image),
  # then group them into batches of the requested size
  return data.map(get_image_label).batch(batch_size)

In [None]:
# Setup input shape to the model
INPUT_SHAPE = [None, IMG_SIZE, IMG_SIZE, 3] # batch, height, width, colour channels

# Setup output shape of the model
OUTPUT_SHAPE = len(unique_labels) # number of unique labels

# Setup model URL from TensorFlow Hub
# NOTE(review): none of these URLs is referenced by the cells visible in this notebook
# (the model is loaded from a saved .h5 file instead); presumably they were used when
# the model was built — confirm before removing.
MODEL_URL = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification/4"
MODEL_URL_2 = "https://tfhub.dev/google/imagenet/mobilenet_v1_100_224/classification/5"
MODEL_URL_3 = "https://tfhub.dev/google/imagenet/mobilenet_v1_025_224/classification/5"
MODEL_URL_4 = "https://tfhub.dev/google/imagenet/mobilenet_v2_075_224/classification/5"

In [None]:
def load_model(model_path):
  """
  Reads a saved Keras model from `model_path` and returns it.

  hub.KerasLayer is passed as a custom object under the name "KerasLayer"
  so that layers saved under that name can be resolved while loading.
  """
  print(f"Loading saved model from: {model_path}")
  hub_objects = {"KerasLayer": hub.KerasLayer}
  return tf.keras.models.load_model(model_path, custom_objects=hub_objects)

## Final Testing using the test() function, as described in the Datathon Submission Guidelines

In [None]:
# Saved model file (.h5) that test() will load
model_path = "drive/MyDrive/Datathon/models/20221007-08281665131319-full-trained-adam.h5"
# Folder with the validation images that test() will predict on
data_path = "drive/MyDrive/Datathon/validation/validation/"

In [None]:
# Function to test the validation data stored in 'data_path' with the model stored in 'model_path'
# here, model_path = "drive/MyDrive/Datathon/models/20221007-08281665131319-full-trained-adam.h5"
#       data_path = "drive/MyDrive/Datathon/validation/validation"

def test(model_path, data_path, output_path=r'drive/MyDrive/Datathon/predicted_label.csv'):
  """
  Predicts a label for every image in `data_path` and writes the result to a CSV file.

  Args:
    model_path: path of the saved model to load.
    data_path: folder containing the images, with or without a trailing slash.
      The part of each file name before the first "." must be an integer ID.
    output_path: CSV file to write; it gets the columns 'id' and 'label'.
      Defaults to the predicted_label.csv file inside the Datathon folder.

  Raises:
    ValueError: if a file name in `data_path` does not start with an integer ID.
  """
  # Load the fully trained model
  loaded_full_model = load_model(model_path)

  # List the folder once so the IDs and the file paths are guaranteed to line up
  val_files = os.listdir(data_path)
  # os.path.join builds a valid path whether or not data_path ends with a slash
  val_filenames = [os.path.join(data_path, fname) for fname in val_files]
  val_ids = [int(fname.split(".")[0]) for fname in val_files]

  # Create validation data batch so as to turn it into tensors and then fit it in our model
  val_data = create_data_batches(val_filenames, test_data=True)

  # Make predictions on the validation data
  predictions = loaded_full_model.predict(val_data, verbose=1)

  # One predicted label per row of prediction probabilities
  val_pred_labels = [get_pred_label(probabilities) for probabilities in predictions]

  # Fitting the data into Pandas dataframe
  df = pd.DataFrame({'id': val_ids, 'label': val_pred_labels})

  # Saving the predicted labels on validation set images in CSV
  df.to_csv(output_path, index=False)

In [None]:
# Run the evaluation; test() writes the predicted labels to a CSV file
test(model_path, data_path)

Loading saved model from: drive/MyDrive/Datathon/models/20221007-08281665131319-full-trained-adam.h5
Creating test data batches...
