# Modelling

## Import

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from keras_preprocessing.image import ImageDataGenerator
from keras import models, layers, losses, optimizers, metrics, callbacks
from keras.models import load_model

import matplotlib.pyplot as plt

#temporary solutions
%run ../scripts/save_utils.py
%run ../scripts/loss_recall_plot.py
%run ../notebooks/loader_modelling.ipynb

## Baseline model

### ImageDataGenerator

Let's initialize data generators. Most importantly, they will rescale vectorized images such that the values are going to be in range 0-1.

In [85]:
# One independent generator per split. Each one only rescales pixel values
# by 1/255; no augmentation is configured here.
train_datagen, val_datagen, test_datagen = (
    ImageDataGenerator(rescale=1./255) for _ in range(3)
)

Now we need to specify the directory in which these images reside. I have decided to keep the original resolution of 512x512 pixels. The *batch_size* is relatively small to reduce memory usage.

In [None]:
# Directory that the relative paths in the `image_path` column are joined to.
# Forward slashes are accepted on Windows as well as on POSIX systems, whereas
# the previous '..\\data\\raw\\merged_data\\' literal only resolved on Windows.
image_dir = '../data/raw/merged_data/'

# Settings shared by all three splits: 512x512 target size, batches of 8 and
# 'categorical' (one-hot) labels read from the `label` column.
flow_options = dict(
    directory=image_dir,
    x_col='image_path',
    y_col='label',
    target_size=(512, 512),
    batch_size=8,
    class_mode='categorical',
    validate_filenames=False,  # do not check every listed file up front
)

train_generator = train_datagen.flow_from_dataframe(df_train, **flow_options)
validation_generator = val_datagen.flow_from_dataframe(df_val, **flow_options)
test_generator = test_datagen.flow_from_dataframe(df_test, **flow_options)

Now we initialize a baseline model. Notice that I have used *clear_session* to reset any state that Keras might retain from a previous use of the model (e.g. when re-running the notebook).

In [87]:
# Reset Keras' global state so that re-running this cell starts from scratch.
tf.keras.backend.clear_session()

# Three Conv2D + MaxPooling2D stages, then a dense head ending in a
# 4-unit softmax layer.
baseline_layers = [
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu',
                  input_shape=(512, 512, 3)),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Flatten(),
    layers.Dense(units=256, activation='relu'),
    layers.Dense(units=4, activation='softmax'),
]
baseline_model = models.Sequential(baseline_layers)

# Categorical cross-entropy loss, Adam with learning rate 0.001, and recall
# as the tracked metric.
baseline_model.compile(
    loss=losses.CategoricalCrossentropy(),
    optimizer=optimizers.Adam(learning_rate=0.001),
    metrics=[metrics.Recall()],
)

In [None]:
# Print the layer-by-layer summary of the baseline model.
baseline_model.summary()

We will also make a callback to invoke early stop. It will monitor validation loss, since we want to minimize it as much as possible.

In [None]:
# Early-stopping callback: watches the validation loss with a patience of
# 8 epochs.
stop_early = callbacks.EarlyStopping(monitor='val_loss', patience=8)
# Persist the callback with the project's save_data helper
# (presumably provided by ../scripts/save_utils.py - confirm).
save_data(stop_early, 'callback', '../save_files/callbacks/stop_early.pkl')

We set *steps_per_epoch* to 500 since *batch_size* is set to 8 and we have approx. 4000 samples in the training set. Thus, to cover as much train data as possible, we will need 500 batches of 8 images.

In [None]:
# Train for at most 30 epochs of 500 batches each, validating on 175 batches
# per epoch; `stop_early` may end the run sooner.
baseline_fit_options = dict(
    steps_per_epoch=500,
    epochs=30,
    validation_data=validation_generator,
    validation_steps=175,
    callbacks=[stop_early],
    verbose=1,
)
baseline_history = baseline_model.fit(train_generator, **baseline_fit_options)

It is good practice to save a model after training, so that we can use it whenever we want without retraining it if its parameters are lost for some reason.

In [91]:
# Write the trained baseline model to an .h5 file under save_files/models.
baseline_model.save('../save_files/models/baseline.h5')

And now we evaluate the model on the **train** and **test** data:

In [None]:
# Evaluate on the training and test generators, returning the metrics as
# dicts. No `batch_size` is passed: the generators already yield batches
# (configured with batch_size=8), and Keras documents that `batch_size`
# should not be specified for generator / Sequence inputs.
baseline_train_results = baseline_model.evaluate(train_generator, return_dict=True)
baseline_test_results = baseline_model.evaluate(test_generator, return_dict=True)

# Persist both result dicts so they can be reloaded without re-evaluating.
save_data(baseline_train_results, 'evaluation_data', '../save_files/evaluation/baseline_train_results.pkl')
save_data(baseline_test_results, 'evaluation_data', '../save_files/evaluation/baseline_test_results.pkl')

In [None]:
# Report loss and recall for the train split, a blank line, then the same
# two metrics for the test split.
print('Evaluation results for baseline model:\n')
for caption, metric_key in (('train loss: ', 'loss'), ('train recall: ', 'recall')):
    print(caption, baseline_train_results[metric_key])
print()
for caption, metric_key in (('test loss:   ', 'loss'), ('test recall: ', 'recall')):
    print(caption, baseline_test_results[metric_key])

We got very good results even for the baseline model.  
  
Let's now visualize its training and evaluation process to see how it behaves.

### Baseline training and validation visualization

In [None]:
# Per-epoch metric history recorded by fit(), plus the matching 1-based
# epoch numbers.
baseline_train_log = baseline_history.history
baseline_num_epochs = np.arange(len(baseline_train_log['loss'])) + 1

# Persist both objects, then draw the loss/recall curves.
for payload, tag, target in (
    (baseline_train_log, 'train_log', '../save_files/train_log/baseline_train_log.pkl'),
    (baseline_num_epochs, 'epochs', '../save_files/train_log/baseline_num_epochs.pkl'),
):
    save_data(payload, tag, target)

loss_recall_plot(baseline_train_log, 'Loss and Recall for baseline model during training')

We see that the model **overfits** after just a couple of epochs. This may come from the fact that the training set is quite small (approx. 4000 images).

## Augmentation

Let's introduce some random augmentation to initial images. This way we will be able to train the model on the more generalized data to hopefully reduce overfitting.

In [None]:
# Training-only generator that, besides rescaling to 0-1, applies random
# rotations, shifts, shear, zoom and horizontal flips.
train_aug = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)

# Same flow settings as the un-augmented generators. The image directory is
# written with forward slashes so that it resolves on POSIX systems as well
# as on Windows (the previous '..\\data\\...' literal was Windows-only).
train_generator_aug = train_aug.flow_from_dataframe(
    df_train,
    '../data/raw/merged_data/',
    x_col='image_path',
    y_col='label',
    target_size=(512, 512),
    batch_size=8,
    class_mode='categorical',
    validate_filenames=False,
)

We will use the same architecture of the model as before:

In [24]:
# Reset Keras' global state before building a fresh model.
tf.keras.backend.clear_session()

# Same stack as the baseline: three Conv2D + MaxPooling2D stages, then a
# dense head ending in a 4-unit softmax layer.
augmented_layers = [
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu',
                  input_shape=(512, 512, 3)),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Flatten(),
    layers.Dense(units=256, activation='relu'),
    layers.Dense(units=4, activation='softmax'),
]
augmented_model = models.Sequential(augmented_layers)

# Categorical cross-entropy loss, Adam with learning rate 0.001, and recall
# as the tracked metric.
augmented_model.compile(
    loss=losses.CategoricalCrossentropy(),
    optimizer=optimizers.Adam(learning_rate=0.001),
    metrics=[metrics.Recall()],
)

In [None]:
# Print the layer-by-layer summary of the augmented model.
augmented_model.summary()

In [None]:
# Train on the augmented generator for at most 30 epochs of 500 batches,
# validating on 175 batches per epoch; `stop_early` may end the run sooner.
augmented_fit_options = dict(
    steps_per_epoch=500,
    epochs=30,
    validation_data=validation_generator,
    validation_steps=175,
    callbacks=[stop_early],
    verbose=1,
)
augmented_history = augmented_model.fit(train_generator_aug, **augmented_fit_options)

In [27]:
# Write the trained augmented model to an .h5 file under save_files/models.
# NOTE(review): the path contains a doubled slash ('models//'); presumably it
# is treated as a single separator - confirm on the target OS.
augmented_model.save('../save_files/models//augmented.h5')

Let's now evaluate the model on the same train data as well as test data to compare the results and see if we have overfitting:

In [None]:
# Evaluate on the (randomly augmented) training generator and on the test
# generator, returning the metrics as dicts. No `batch_size` is passed: the
# generators already yield batches (configured with batch_size=8), and Keras
# documents that `batch_size` should not be specified for generator /
# Sequence inputs.
augmented_train_results = augmented_model.evaluate(train_generator_aug, return_dict=True)
augmented_test_results = augmented_model.evaluate(test_generator, return_dict=True)

# Persist both result dicts so they can be reloaded without re-evaluating.
save_data(augmented_train_results, 'evaluation_data', '../save_files/evaluation/augmented_train_results.pkl')
save_data(augmented_test_results, 'evaluation_data', '../save_files/evaluation/augmented_test_results.pkl')

In [None]:
# Report loss and recall for the train split, a blank line, then the same
# two metrics for the test split.
print('Evaluation results for model with augmented images input:\n')
for caption, metric_key in (('train loss: ', 'loss'), ('train recall: ', 'recall')):
    print(caption, augmented_train_results[metric_key])
print()
for caption, metric_key in (('test loss:   ', 'loss'), ('test recall: ', 'recall')):
    print(caption, augmented_test_results[metric_key])

And now we visualize the training process to see how loss and recall correlate:

In [None]:
# Per-epoch metric history recorded by fit(), plus the matching 1-based
# epoch numbers.
augmented_train_log = augmented_history.history
augmented_num_epochs = np.arange(1, len(augmented_train_log['loss']) + 1)

# Persist both objects. The epoch array is labelled 'epochs', matching how
# the baseline and additional-training cells save their epoch arrays (it was
# previously labelled 'train_log', apparently a copy-paste slip).
save_data(augmented_train_log, 'train_log', '../save_files/train_log/augmented_train_log.pkl')
save_data(augmented_num_epochs, 'epochs', '../save_files/train_log/augmented_num_epochs.pkl')

loss_recall_plot(augmented_train_log, 'Loss and Recall for augmented model during training')

We see that by using **augmentation** we were able to get rid of **overfitting** completely. However, *recall* is much lower than that of the previous model.  

Possible solution - additional training.

### Additional training with augmented images

Before additional training, let's save an old model once again (in case it goes wrong):

In [56]:
# Save a backup copy of the current augmented model before training it further.
# NOTE(review): the path mixes '//' and '/' separators; presumably equivalent
# to single slashes - confirm on the target OS.
augmented_model.save('..//save_files//models/augmented_backup.h5')

And continue training:

In [None]:
# Continue training the already-fitted augmented model with the same
# settings as before: up to 30 epochs of 500 batches, 175 validation batches
# per epoch, with early stopping.
additional_fit_options = dict(
    steps_per_epoch=500,
    epochs=30,
    validation_data=validation_generator,
    validation_steps=175,
    callbacks=[stop_early],
    verbose=1,
)
additional_training_aug_history = augmented_model.fit(train_generator_aug, **additional_fit_options)

In [None]:
# Per-epoch metric history of the additional training run.
additional_augmented_train_log = additional_training_aug_history.history
# The epoch axis must be derived from THIS run's log. It was previously
# computed from `baseline_train_log`, which yields the wrong length whenever
# early stopping ends the two runs after a different number of epochs.
additional_augmented_num_epochs = np.arange(1, len(additional_augmented_train_log['loss']) + 1)

# Persist both objects, then draw the loss/recall curves.
save_data(additional_augmented_train_log, 'train_log', '../save_files/train_log/additional_augmented_train_log.pkl')
save_data(additional_augmented_num_epochs, 'epochs', '../save_files/train_log/additional_augmented_num_epochs.pkl')

loss_recall_plot(additional_augmented_train_log, 'Loss and Recall for augmented model with additional training during training')

Additional training didn't help.  
  
Let's now try another approach:

### Less augmentation

Let's now see what happens if we add a little bit of augmentation.  
  
In addition to that, we will also add one *Dropout* layer:

In [None]:
# Milder augmentation than the previous experiment: small random rotations,
# shifts, shear and zoom, and no horizontal flipping.
train_datagen_aug_small = ImageDataGenerator(
    rescale=1./255,
    rotation_range=5,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    zoom_range=0.05,
    horizontal_flip=False,
)

# Same flow settings as the other generators. The image directory is written
# with forward slashes so that it resolves on POSIX systems as well as on
# Windows (the previous '..\\data\\...' literal was Windows-only).
train_generator_aug_small = train_datagen_aug_small.flow_from_dataframe(
    df_train,
    '../data/raw/merged_data/',
    x_col='image_path',
    y_col='label',
    target_size=(512, 512),
    batch_size=8,
    class_mode='categorical',
    validate_filenames=False,
)

In [13]:
# Reset Keras' global state before building a fresh model.
tf.keras.backend.clear_session()

# Same convolutional stack as before, with a Dropout(0.5) layer inserted
# between Flatten and the first Dense layer.
dropout_layers = [
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu',
                  input_shape=(512, 512, 3)),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Flatten(),
    layers.Dropout(rate=0.5),
    layers.Dense(units=256, activation='relu'),
    layers.Dense(units=4, activation='softmax'),
]
dropout_augmented_model = models.Sequential(dropout_layers)

# Categorical cross-entropy loss, Adam with learning rate 0.001, and recall
# as the tracked metric.
dropout_augmented_model.compile(
    loss=losses.CategoricalCrossentropy(),
    optimizer=optimizers.Adam(learning_rate=0.001),
    metrics=[metrics.Recall()],
)

In [None]:
# Print the layer-by-layer summary of the dropout model.
dropout_augmented_model.summary()

In [None]:
# Train on the mildly augmented generator for at most 60 epochs of 500
# batches, validating on 175 batches per epoch; `stop_early` may end the run
# sooner.
dropout_fit_options = dict(
    steps_per_epoch=500,
    epochs=60,
    validation_data=validation_generator,
    validation_steps=175,
    callbacks=[stop_early],
    verbose=1,
)
dropout_augmented_history_train = dropout_augmented_model.fit(train_generator_aug_small, **dropout_fit_options)