# Solanum identifier using Neural Networks

Contains the code to balance classes and create the training/val/test sets

# Environment preparations

## Dependencies

In [None]:
!pip install -U keras tensorflow matplotlib numpy pandas imblearn split-folders tensorflow-addons
!pip install -q "tqdm>=4.36.1"

## Env vars

In [1]:
# Set this variable to the root path of where the files are located
# NOTE: %env takes everything up to the end of the line as the value, so do
# not append trailing comments to the %env lines below.
%env DATA_LOCATION=P:\CODE\ITESM\tesis-dataset-downloader\solanum_output
%env CLEAN_DATA_FOLDER=z_clean_resized
%env CLEAN_SPLIT_DATA_FOLDER=z_clean_resized_split

# Alternative roots (kept for reference; uncomment the one that applies)
#%env DATA_LOCATION=/workspace/jupyter_workspace/tesis
#%env DATA_LOCATION=/content/drive/MyDrive/Datasets/AndroidOrIos
# Echo the configured root. %VAR% is Windows cmd syntax; a POSIX shell would
# need $DATA_LOCATION instead.
!echo %DATA_LOCATION%

#!dir %DATA_LOCATION%

# Constants
import os

# Seed shared by the samplers and the image generators further down
RANDOM_SEED = 1988
# The three values below are read back from the environment variables set
# above; os.environ[...] raises KeyError if one of them is missing.
DATA_ROOT_LOCATION = os.environ["DATA_LOCATION"]
CLEAN_DATA_FOLDER = os.environ["CLEAN_DATA_FOLDER"]
CLEAN_SPLIT_DATA_FOLDER = os.environ["CLEAN_SPLIT_DATA_FOLDER"]

# Class names handed to evaluate_model() as report labels.
# NOTE(review): presumably these are the class sub-folder names of the
# dataset; confirm that this order matches the class indices that
# flow_from_directory assigns (TODO confirm).
LABELS = [
            "petota",
            "holophylla",
            "melongena",
            "torva",
            "brevantherum",
            "solanum",
            "dulcamara",
            "herposolanum",
            "micracantha",
            "lasiocarpa",
            "acanthophora",
            "anarrhichomenum",
        ]

env: DATA_LOCATION=P:\CODE\ITESM\tesis-dataset-downloader\solanum_output
env: CLEAN_DATA_FOLDER=z_clean_resized
env: CLEAN_SPLIT_DATA_FOLDER=z_clean_resized_split
P:\CODE\ITESM\tesis-dataset-downloader\solanum_output


## Imports

In [2]:
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.utils import img_to_array

from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import tensorflow_addons as tfa

# tqdm Progress Bar
import tqdm

# quietly deep-reload tqdm
#import sys
#from IPython.lib import deepreload 

#stdout = sys.stdout
#sys.stdout = open('junk','w')
#deepreload.reload(tqdm)
#sys.stdout = stdout
# As shown in https://www.tensorflow.org/addons/tutorials/tqdm_progress_bar

import matplotlib
import matplotlib.pyplot as plt

import numpy as np
from numpy import array

import pandas as pd

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import splitfolders

#from sklearn.metrics import classification_report, confusion_matrix
#import sklearn.model_selection as model_selection

#from imutils import paths
#import shutil

import os
import time
#import random

# Enable 3rd Party Jupyter Widgets in Google Colab
#from google.colab import output
#output.enable_custom_widget_manager()

def get_training_device_name():
    """
    Returns the name of the device to train on.

    Asks TensorFlow for the GPU device name and returns it when it contains
    "GPU"; otherwise prints a warning and returns '/cpu:0'.
    """
    device = tf.test.gpu_device_name()

    if "GPU" in device:
        print('Found GPU at: {}'.format(device))
        return device

    print("No GPU was found!, training will be done in the CPU which will be slower")
    return '/cpu:0'

TRAINING_DEVICE_NAME = get_training_device_name()

Found GPU at: /device:GPU:0


## Files description

In [None]:
# Load the CSV that lives inside the clean-data folder.
images_csv_path = os.path.join(
    DATA_ROOT_LOCATION,
    CLEAN_DATA_FOLDER,
    "images_dedup_512x512_100picspersection.csv",
)
images_df = pd.read_csv(images_csv_path)

# Update all paths to conform to the local structure (Only if running in a UN*X environment)

# 'P:/CODE/ITESM/tesis-dataset-downloader/solanum_output/z_clean_resized/acanthophora/acanthophora_acerifolium_1928496814_gbif_2700.jpg'
# path_to_replace = "P:/CODE/ITESM/tesis-dataset-downloader/solanum_output/z_clean_resized"
# images_df["full_path"] = images_df["full_path"].str.replace(path_to_replace, data_root_location)

# Summary statistics for every column, numeric or not.
images_summary = images_df.describe(include="all")
display(images_summary)

## Tools

In [3]:
def print_marquee(msg: str):
    """
    Prints `msg` framed by a box of asterisks, preceded by two blank lines.

    The border is four characters wider than the message so that it lines up
    with the "* " / " *" decorations around the text.
    """
    border = "*" * (len(msg) + 4)
    # The leading "\n" element plus the separator reproduces the two blank
    # lines that a bare print("\n") emits.
    print("\n", border, f"* {msg} *", border, sep="\n")

def model_2_pkl(model, filename: str):
    """
    Exports a model to PKL format.

    Args:
        model: Any picklable object.
        filename: Path of the file to write; an existing file is overwritten.
    """
    # Imported locally: the notebook's import cell does not import pickle.
    import pickle

    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails half-way.
    with open(filename, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)

def pkl_2_model(filename: str):
    """
    Loads a model from a PKL file.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object stored in the file.

    WARNING: unpickling can execute arbitrary code. Only load files that
    were produced by a trusted source.
    """
    # Imported locally: the notebook's import cell does not import pickle.
    import pickle

    # The context manager closes the handle as soon as loading finishes.
    with open(filename, 'rb') as pkl_file:
        return pickle.load(pkl_file)

def train_model(model_to_fit: tf.keras.models.Model,
                fit_params: dict):
    """
    Fits a model and reports how long the training took.

    Args:
        model_to_fit: Model whose `fit` method is called.
        fit_params: Keyword arguments forwarded verbatim to `fit`.

    Returns:
        Whatever `model_to_fit.fit(**fit_params)` returns.
    """
    # perf_counter() measures elapsed wall-clock time. process_time() only
    # counts CPU time of this process, which is not the time the user waited
    # for training (time spent sleeping or waiting is excluded).
    start = time.perf_counter()

    fit_result = model_to_fit.fit(**fit_params)

    elapsed_time = time.perf_counter() - start

    print(f"\n\n ********* Training time: {elapsed_time} s.")
    return fit_result

def graph_loss_accuracy(h_model,
                        subtitle: str = ""):
    """
    Plots the train/validation accuracy and loss curves, one figure each.

    Args:
        h_model: Object with a `history` mapping that contains the
            "accuracy", "val_accuracy", "loss" and "val_loss" series.
        subtitle: Optional text appended to both titles in parentheses.
    """
    history = h_model.history
    epochs = np.arange(0, len(history["accuracy"]))

    # Build the suffix on its own. Written inline as
    #   "Title" + f" ({subtitle})" if subtitle else ""
    # the conditional applies to the whole concatenation, so an empty
    # subtitle produced an empty title instead of the bare title.
    title_suffix = f" ({subtitle})" if subtitle else ""

    plt.style.use("ggplot")

    # (history key, legend stub, figure title, y-axis label)
    figures = (
        ("accuracy", "acc", "Training and Validation Accuracy", "Accuracy"),
        ("loss", "loss", "Training and Validation Loss", "Loss"),
    )
    for metric, legend_stub, title, y_label in figures:
        plt.figure()
        plt.plot(epochs, history[metric], label=f"train_{legend_stub}")
        plt.plot(epochs, history[f"val_{metric}"], label=f"val_{legend_stub}")
        plt.title(title + title_suffix)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

# Helper functions for reporting

def _is_per_class_matrix(values) -> bool:
    """True when `values` is 2-D with more than one column (one column per class)."""
    return values.ndim == 2 and values.shape[1] > 1


def print_dataset_prediction_report(y_pred,
                                    y_real,
                                    labels: list=None):
    """
    Prints the classification report and the confusion matrix.

    Args:
        y_pred: Model predictions. A 2-D array with one column per class is
            reduced to the index of the highest-scoring class; any other
            shape is rounded to the nearest integer.
        y_real: Ground-truth labels. A 2-D array with one column per class
            (one-hot) is reduced to class indices; any other shape is used
            unchanged.
        labels: Optional display names, passed as `target_names`.
    """
    # Imported locally: the function needs these two names, but the
    # sklearn.metrics import in the notebook's import cell is commented out.
    from sklearn.metrics import classification_report, confusion_matrix

    predicted = np.asarray(y_pred)
    if _is_per_class_matrix(predicted):
        # Rounding per-class probabilities yields a multilabel matrix (a row
        # may even be all zeros), which confusion_matrix rejects. Take the
        # arg-max class instead.
        y_pred = predicted.argmax(axis=1)
    else:
        y_pred = np.round(predicted).astype(int)

    expected = np.asarray(y_real)
    if _is_per_class_matrix(expected):
        y_real = expected.argmax(axis=1)

    print_marquee("Classification Report")
    print(classification_report(y_real,
                                y_pred,
                                target_names=labels))

    print_marquee("Confusion Matrix")
    print(confusion_matrix(y_real,
                           y_pred))


def _collect_validation_predictions(model, validation_data, batch_size):
    """
    Returns `(predictions, true_labels)` for the validation data.

    Handles an `(x, y, ...)` tuple/list and any batch container that supports
    `len()` and integer indexing and yields `(x, y, ...)` batches.
    """
    if isinstance(validation_data, (tuple, list)):
        # Predict on the features only; passing the whole (x, y) pair would
        # make the labels look like a second model input.
        features, targets = validation_data[0], validation_data[1]
        return model.predict(features, batch_size=batch_size), targets

    if not (hasattr(validation_data, "__len__")
            and hasattr(validation_data, "__getitem__")):
        raise TypeError(
            "validation_data must be an (x, y) tuple or an indexable batch "
            f"sequence, got {type(validation_data).__name__}"
        )

    # Predict batch by batch and keep the labels that came with each batch, so
    # predictions and labels stay aligned even if the container shuffles.
    batch_predictions, batch_targets = [], []
    for batch_index in range(len(validation_data)):
        batch = validation_data[batch_index]
        batch_predictions.append(model.predict_on_batch(batch[0]))
        batch_targets.append(batch[1])
    return np.concatenate(batch_predictions), np.concatenate(batch_targets)


def evaluate_model(model_to_fit: tf.keras.models.Model,
                   fit_params: dict,
                   labels: list = None) -> tuple:
    """
    Trains a model, then reports on it using the validation data.

    Steps: fit the model, print its summary, print the classification report
    and confusion matrix for the validation data, and plot the accuracy and
    loss curves.

    Args:
        model_to_fit: Model to train.
        fit_params: Keyword arguments forwarded to `fit`. Must contain
            "validation_data"; "batch_size" is optional and only used when
            the validation data is an `(x, y)` tuple.
        labels: Optional class names for the classification report.

    Returns:
        A `(model_to_fit, history)` tuple, where `history` is the value
        returned by `train_model`.
    """
    trained_model_history = train_model(
                                        model_to_fit,
                                        fit_params,
                                       )
    print_marquee("Model Summary")
    model_to_fit.summary()

    print_marquee("Validation Dataset Confusion Matrix")

    val_model_predictions, val_real_labels = _collect_validation_predictions(
        model_to_fit,
        fit_params["validation_data"],
        fit_params.get("batch_size"),
    )

    print_dataset_prediction_report(val_model_predictions,
                                    val_real_labels,
                                    labels)

    print_marquee("Train/Val Accuracy and Loss graphs")

    # If using early stopping, it might be the case that we used less epochs than
    #  requested
    subtitle = f"Epochs: {len(trained_model_history.history['accuracy'])}"

    graph_loss_accuracy(trained_model_history,
                        subtitle=subtitle)

    return model_to_fit, trained_model_history

def create_model_checkpoint(filepath: str) -> tf.keras.callbacks.ModelCheckpoint:
    """
    Builds a ModelCheckpoint callback that writes to `filepath`.

    The callback monitors 'val_loss', is evaluated every epoch, keeps only
    the best result and saves the full model rather than just the weights.
    """
    checkpoint_settings = dict(
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode='auto',
        save_freq='epoch',
        options=None,
        initial_value_threshold=None,
    )
    return tf.keras.callbacks.ModelCheckpoint(filepath, **checkpoint_settings)

# Data preparation

## Data at a glance

In [None]:
print_marquee("Classes")
# Number of images per section, largest class first.
count_per_section = (
    images_df
    .groupby(["section"])
    .size()
    .reset_index(name='count')
    .sort_values("count", ascending=False)
)
display(count_per_section)

# Distribution of the per-class counts (mean, quartiles, ...).
display(count_per_section.describe())

print_marquee("Info")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
images_df.info()

## Balancing of classes TBD

We can see that there are classes that are overrepresented, so we need to do some undersampling and oversampling in order for the model to better learn and classify.

The mean number of samples per class is `709.58`, while the median (listed as the 2nd quartile, or 50%) is `483`. The strategy is therefore to randomly undersample every class that has more than `500` samples and to oversample every class that has fewer.

In [None]:
# Class balancing with imbalanced-learn (work in progress, see the TBD header).
# https://imbalanced-learn.org/stable/
# NOTE(review): `undersampler` is created here but never used in this cell.
undersampler = RandomUnderSampler(sampling_strategy='all',
                                  random_state=RANDOM_SEED)

oversampler = RandomOverSampler(sampling_strategy='auto',
                                  random_state=RANDOM_SEED)

# NOTE(review): `X_train` and `y_train` are not defined anywhere in the
# visible notebook, so this call raises NameError on a fresh kernel.
# They presumably should be derived from `images_df` - TODO confirm.
X_train_res, y_train_res = oversampler.fit_resample(X_train,
                                                    y_train)

## Train/Val/Test split

For this part we'll use a 70/20/10 split for the train, validation and test datasets.



In [None]:
%%time
#splitfolders.ratio(os.path.join(DATA_ROOT_LOCATION, CLEAN_DATA_FOLDER), # The location of dataset
#                   output=os.path.join(DATA_ROOT_LOCATION, CLEAN_SPLIT_DATA_FOLDER), # The output location
#                   seed=RANDOM_SEED, # The number of seed
#                   ratio=(.7, .2, .1), # The ratio of splited dataset
#                   group_prefix=None, # If your dataset contains more than one file like ".jpg", ".pdf", etc
#                   move=False # If you choose to move, turn this into True
#                   )

## Data augmentation

In [21]:
TARGET_SIZE = (64, 64)
BATCH_SIZE = 8
COLOR_MODE = "rgb"

def get_image_data_gen_params():
    """
    Returns the keyword arguments shared by every `flow_from_directory` call.

    A new dict is built on each call from the module-level TARGET_SIZE,
    BATCH_SIZE, COLOR_MODE and RANDOM_SEED constants.
    """
    shared_params = dict(
        target_size=TARGET_SIZE,
        batch_size=BATCH_SIZE,
        color_mode=COLOR_MODE,
        class_mode="categorical",
        seed=RANDOM_SEED,
        save_prefix='augmented_',
        save_format='png',
    )
    return shared_params


# Training images are augmented and scaled to the 0-1 range.
#
# The train generator used to also pass
# preprocessing_function=tf.keras.applications.vgg16.preprocess_input.
# ImageDataGenerator applies that function and then multiplies by `rescale`,
# so training images were normalised differently from the validation images
# (which are only rescaled). The model would then be validated on inputs
# from a different distribution than it was trained on. Both generators now
# apply the same 0-1 rescale.
train_datagen = ImageDataGenerator(
            rescale=1.0/255, # Normalize the data to be 0-1
            rotation_range=40,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            fill_mode='nearest')

# Validation images are only rescaled, never augmented.
test_datagen = ImageDataGenerator(
    rescale=1.0/255
)

# NOTE(review): save_to_dir points at the same folder the images are read
# from, so every generated batch is also written into the dataset folder.
# Confirm this is intended and not a debugging leftover.
train_generator = train_datagen.flow_from_directory(
        os.path.join(DATA_ROOT_LOCATION, CLEAN_SPLIT_DATA_FOLDER, "train"),
        save_to_dir=os.path.join(DATA_ROOT_LOCATION, CLEAN_SPLIT_DATA_FOLDER, "train"),
        **get_image_data_gen_params())

validation_generator = test_datagen.flow_from_directory(
        os.path.join(DATA_ROOT_LOCATION, CLEAN_SPLIT_DATA_FOLDER, "val"),
        save_to_dir=os.path.join(DATA_ROOT_LOCATION, CLEAN_SPLIT_DATA_FOLDER, "val"),
        **get_image_data_gen_params())

Found 5955 images belonging to 12 classes.
Found 1698 images belonging to 12 classes.


# Models

## Manual VGG8 TBD

## Manual VGG16 TBD

In [None]:
|

## VGG8 TBD

## VGG16 TBD

In [None]:
%%time

# Class names ordered by the index the generator assigned to each folder.
# flow_from_directory numbers the class folders in alphanumeric order, which
# is not the order of the hand-written LABELS list, so using LABELS as report
# names would attach the wrong name to each class.
vgg16_class_names = sorted(train_generator.class_indices,
                           key=train_generator.class_indices.get)

vgg16_tf_model = tf.keras.applications.vgg16.VGG16(
    include_top=True,
    weights=None,
    # The generators emit TARGET_SIZE RGB images. Without an explicit
    # input_shape the network is built for 224x224 inputs and rejects them.
    # A custom shape with include_top=True is only accepted because
    # weights=None (no pre-trained weights are loaded).
    input_shape=TARGET_SIZE + (3,),
    classes=len(vgg16_class_names),
    classifier_activation='softmax'
)

# Add an optimizer
vgg16_tf_model.compile(optimizer="adam",
                        loss='categorical_crossentropy',
                        metrics=['accuracy'])

# Add a progress bar and save checkpoints
vgg16_callbacks = [
    create_model_checkpoint(os.path.join(DATA_ROOT_LOCATION, "vgg16")),
    tfa.callbacks.TQDMProgressBar()
    #tf.keras.callbacks.ProgbarLogger(
    #    count_mode = 'steps',
    #    stateful_metrics = None
    #)
]


with tf.device(TRAINING_DEVICE_NAME):
    evaluate_model(vgg16_tf_model,
                   fit_params = {
                            "x": train_generator,
                            "batch_size": BATCH_SIZE,
                            "epochs": 10,
                            "callbacks": vgg16_callbacks,
                            "validation_data": validation_generator,
                            "steps_per_epoch": 64,
                            "validation_freq": 1,
                            "max_queue_size": 1,
                            "workers": 1,
                            "use_multiprocessing": False
                   },
                   labels = vgg16_class_names)

## ResNET50 TBD

In [10]:
25088/256

98.0

## State of the art TBD