# Introduction

* **Competition** - Given a set of images of cassava leaves, we need to classify each image as showing one of four diseases or a healthy leaf.
* **Data** - A collection of 21397 labelled images
* **Evaluation** - Classification accuracy

We will take a quick look at the data files and train our model in this notebook. The model can be saved and used for inference in a later notebook.

# Importing necessary libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Root folder of the competition data. The trailing slash is required because
# file paths in this notebook are built by plain string concatenation.
ROOT_DIR = '../input/cassava-leaf-disease-classification/'
# A bare expression is echoed only when it is the last statement of a notebook
# cell; print explicitly so the directory listing is actually shown.
print(os.listdir(ROOT_DIR))

import json # to read in the 'label_num_to_disease_map.json' file

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import cv2
import random

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB1
from tensorflow.keras.utils import plot_model

In [None]:
# Image folders, derived from ROOT_DIR so the dataset location is defined in
# exactly one place (the resulting paths are unchanged).
TRAIN_DIR = ROOT_DIR + 'train_images/'
TEST_DIR = ROOT_DIR + 'test_images/'

In [None]:
# set seed
def seed_everything(seed = 42):
    """Seed the random number generators used in this notebook.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``
    module, NumPy's global generator and TensorFlow's global generator.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this setting
    # affects processes launched afterwards, not the already running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # `random` is imported by this notebook but was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seed_everything(seed = 42)

# Data exploration

In [None]:
# Load the training labels and the sample submission file from the competition folder.
train_df = pd.read_csv(ROOT_DIR + 'train.csv')
sample_df = pd.read_csv(ROOT_DIR + 'sample_submission.csv')

In [None]:
# Print the shape of both frames and preview the first rows of the training labels.
print(train_df.shape, sample_df.shape)
display(train_df.head())

In [None]:
# Load the mapping from numeric label to disease name. The context manager
# closes the file handle, which was previously left open.
with open(ROOT_DIR + 'label_num_to_disease_map.json') as f:
    data = json.load(f)
print(json.dumps(data, indent = 2))

In [None]:
# Draw 20 random rows and keep their file names and labels for the image grid.
z = train_df.sample(n = 20)
display(z)
images = z['image_id'].tolist()
labels = z['label'].tolist()

# Plotting 20 randomly sampled images with class

In [None]:
# 4 x 5 grid: one sampled image per cell, titled with its disease name.
plt.figure(figsize = (20, 20))
for position, (image_id, label) in enumerate(zip(images, labels), start = 1):
    plt.subplot(4, 5, position)
    # OpenCV loads images in BGR channel order; matplotlib expects RGB.
    bgr = cv2.imread(TRAIN_DIR + image_id)
    plt.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    # Keys of the JSON label map are strings, hence the str() conversion.
    plt.title(data[str(label)])

# Plotting a count plot and pie chart for class distribution.

In [None]:
# Bar chart of the number of training images per class label.
# The trailing semicolon suppresses the echoed return value in the notebook.
sns.set_style('whitegrid')
sns.countplot(x = 'label', data = train_df, palette = 'Pastel1');

In [None]:
# Per-class image counts as a two-column frame for plotly.
pie_df = train_df['label'].value_counts().reset_index()
# Name the columns explicitly instead of relying on what reset_index produces.
pie_df.columns = ['label', 'count']

# Pie chart of the share of each class in the training set.
fig = px.pie(
    pie_df,
    names = 'label',
    values = 'count',
    color_discrete_sequence = px.colors.qualitative.Pastel,
)
fig.show()

There is a class imbalance problem here: there are far more Cassava Mosaic Disease (CMD) samples than samples of any other class. We can address this by either upsampling the other classes or downsampling the CMD samples.

# Split dataset for training and validation
Reserving 20% of data for validation

In [None]:
# Cast the label column to strings; it is passed as y_col to
# flow_from_dataframe with class_mode "sparse" further below.
train_df = train_df.astype({"label": str})

In [None]:
# Hold out 20% of the labelled images for validation. Stratify on the label so
# both splits keep the class proportions of the full, imbalanced dataset;
# an unstratified split can leave the rare classes unevenly represented.
train, test = train_test_split(train_df,
                               test_size = 0.2,
                               random_state = 42,
                               stratify = train_df['label'])
print(train.shape, test.shape)

# Creating ImageDataGenerator to generate data in batches and perform image augmentation.

In [None]:
# Side length in pixels of the square images fed to the network.
IMG_SIZE = 224
# (height, width) tuple passed as target_size to the data generators.
size = (IMG_SIZE,IMG_SIZE)

In [None]:
# Random augmentation applied on the fly to every generated batch.
datagen = ImageDataGenerator(
    # Flips
    horizontal_flip = True,
    vertical_flip = True,
    # Geometric jitter
    rotation_range = 40,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
    # Pixels exposed by a transform are filled from the nearest valid pixel.
    fill_mode = 'nearest',
)

In [None]:
# Batches of augmented training images, read from disk by file name.
train_generator = datagen.flow_from_dataframe(
    dataframe = train,
    directory = TRAIN_DIR,
    # Columns holding the file name and the class label.
    x_col = "image_id",
    y_col = "label",
    class_mode = "sparse",
    # Image loading options.
    target_size = size,
    interpolation = "nearest",
    # Batching: reshuffled with a fixed seed.
    batch_size = 64,
    shuffle = True,
    seed = 42,
)

In [None]:
# Validation images must not be randomly augmented: otherwise val_loss and
# val_accuracy vary with the random transforms and no longer measure
# performance on the real images. A plain ImageDataGenerator only loads and
# resizes the files.
valid_datagen = ImageDataGenerator()

valid_generator = valid_datagen.flow_from_dataframe(
                    test,
                    directory = TRAIN_DIR,
                    x_col = "image_id",
                    y_col = "label",
                    target_size = size,
                    class_mode = "sparse",
                    batch_size = 64,
                    # Keep file order fixed so predictions line up with the rows of `test`.
                    shuffle = False,
                    seed = 42,
                    interpolation = "nearest"
)

# Model creation and training

In [None]:
# Four disease classes plus "healthy"; used as the width of the softmax output layer.
NUM_CLASSES = 5

In [None]:
def create_model():
    """Build the classifier: an EfficientNetB0 backbone plus a small dense head.

    The backbone is created without its top layers, with ImageNet weights and
    an (IMG_SIZE, IMG_SIZE, 3) input. The head pools the feature map, applies
    a 512-unit ReLU layer with 25% dropout and ends in a NUM_CLASSES-way
    softmax.

    Returns:
        The uncompiled Sequential model.
    """
    backbone = EfficientNetB0(
        input_shape = (IMG_SIZE, IMG_SIZE, 3),
        include_top = False,
        weights = 'imagenet',
    )
    head = [
        layers.GlobalAveragePooling2D(),
        layers.Flatten(),
        layers.Dense(512, activation = 'relu'),
        layers.Dropout(0.25),
        layers.Dense(NUM_CLASSES, activation = 'softmax'),
    ]
    return models.Sequential([backbone, *head])

In [None]:
# Instantiate the network defined by create_model().
model = create_model()

In [None]:
# Labels come from the generators as integer class indices (class_mode "sparse"),
# hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer = Adam(learning_rate = 0.001),
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Stop training when the validation loss has not improved for 5 epochs and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(monitor = 'val_loss',
                               patience = 5,
                               mode = 'min',
                               restore_best_weights = True)

# Save the model with the minimum validation loss
checkpoint = ModelCheckpoint('best_model.hdf5', 
                             monitor = 'val_loss',
                             verbose = 1,
                             mode = 'min', 
                             save_best_only = True)

# Multiply the learning rate by 0.2 when the validation loss has not improved
# for 3 epochs. min_lr must be BELOW the optimizer's initial learning rate
# (0.001): with the previous min_lr = 0.001 the rate was clamped at its
# starting value and this callback could never reduce it.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss',
                              factor = 0.2,
                              patience = 3,
                              min_lr = 1e-6,
                              mode = 'min',
                              verbose = 1)

In [None]:
# Maximum number of training epochs (early stopping may end training sooner).
EPOCHS = 20
# Training: number of full batches per epoch.
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
# Validation: round up so the final, partial batch is evaluated as well.
# Floor division silently dropped up to batch_size - 1 validation images.
STEP_SIZE_VALID = (valid_generator.n + valid_generator.batch_size - 1)//valid_generator.batch_size

In [None]:
# Train the model; `history` keeps the per-epoch loss and accuracy values.
history = model.fit(
    x = train_generator,
    epochs = EPOCHS,
    steps_per_epoch = STEP_SIZE_TRAIN,
    validation_data = valid_generator,
    validation_steps = STEP_SIZE_VALID,
    callbacks = [early_stopping, checkpoint, reduce_lr],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Draw the model graph, annotated with tensor shapes.
plot_model(model, show_shapes = True)

# Model evaluation

In [None]:
# Loss and accuracy on the validation split. Model.evaluate accepts generators
# directly (just like Model.fit above); evaluate_generator is deprecated.
model.evaluate(valid_generator, steps = STEP_SIZE_VALID)

In [None]:
# Per-epoch metrics recorded during training.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

# x axis: zero-based epoch index.
epochs = range(len(acc))

# First figure: training vs. validation accuracy.
plt.plot(epochs, acc, 'c-', label = 'Training accuracy')
plt.plot(epochs, val_acc, 'y-', label = 'Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()

# Second figure: training vs. validation loss.
plt.figure()
plt.plot(epochs, loss, 'c-', label = 'Training Loss')
plt.plot(epochs, val_loss, 'y-', label = 'Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

The prediction and submission notebook can be found here: [Inference Notebook](https://www.kaggle.com/lavanyask/cassava-leaf-disease-inference)