In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
# Define image dimensions and paths
IMG_WIDTH = 224
IMG_HEIGHT = 224
BATCH_SIZE = 32
DATA_DIR = 'C:/Users/fauzi/PycharmProjects/Corn Leaf Disease/dataset/train'

# --- Data Processing Framework ---
# We will split the data from the train directory into training and validation sets.
# Data Augmentation and Rescaling for Training Data, also specifying the validation split.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2  # Splitting 20% of the data for validation
)

# Flow training images in batches from directory
train_generator = train_datagen.flow_from_directory(
    DATA_DIR,
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training'  # Specify this is the training set
)

# Flow validation images in batches from directory
validation_generator = train_datagen.flow_from_directory(
    DATA_DIR,
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',  # Specify this is the validation set
    shuffle=False # Important for evaluation
)

# Print class indices to verify
print("Class indices:", train_generator.class_indices)
NUM_CLASSES = len(train_generator.class_indices)


Found 4328 images belonging to 3 classes.
Found 1080 images belonging to 3 classes.
Class indices: {'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot': 0, 'Corn_(maize)___Common_rust_': 1, 'Corn_(maize)___healthy': 2}


### Explanation of the Data Preprocessing Method

The method used for data preprocessing is a standard and highly effective approach for image classification tasks, especially when working with deep learning models like Convolutional Neural Networks (CNNs). Here's a breakdown of why this method is so effective:

**1. `ImageDataGenerator`:**

*   **Efficiency:** Instead of loading all the images into memory at once (which can be very memory-intensive, especially with large datasets), `ImageDataGenerator` creates a Python generator that loads the images in batches. This is far more memory-efficient and is a common practice in deep learning.
*   **On-the-Fly Augmentation:** The generator applies data augmentation transformations to the images as they are being loaded. This means that the model sees slightly different versions of the same image in each epoch, which helps to improve the model's ability to generalize.
*   **Validation Split:** The `validation_split` argument is a convenient way to automatically reserve a portion of your training data for validation. This is crucial for monitoring the model's performance on unseen data during training.

**2. Data Augmentation:**

*   **Reduces Overfitting:** Data augmentation artificially expands the training dataset by creating modified versions of the images. This helps to prevent the model from "memorizing" the training data and improves its ability to generalize to new, unseen images.

**3. `flow_from_directory`:**

*   **Convenience:** This function is incredibly convenient. As long as your directory structure is set up correctly (with separate subdirectories for each class), `flow_from_directory` will automatically infer the class labels from the directory names.
*   **Batching and Resizing:** It handles the batching of the data (controlled by `batch_size`) and resizes the images to the desired `target_size` on the fly.

In summary, this data preprocessing pipeline is a robust and efficient way to prepare image data for training a deep learning model. It handles memory management, data augmentation, and the creation of data batches, all of which are essential for successful model training.


In [3]:
# --- VGG-16 Model Architecture ---

# Load the VGG-16 model, pre-trained on ImageNet, without the top classification layers
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))

# Freeze the convolutional base
for layer in base_model.layers:
    layer.trainable = False

# Create a new model on top
vgg_model = Sequential([
    base_model,
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax')
])

vgg_model.summary()

# Compile the model
vgg_model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])


In [4]:
# --- Train the Model ---
history = vgg_model.fit(
    train_generator,
    steps_per_epoch=np.ceil(train_generator.samples / BATCH_SIZE),
    epochs=10, # You can increase the number of epochs
    validation_data=validation_generator,
    validation_steps=np.ceil(validation_generator.samples / BATCH_SIZE)
)


Epoch 1/10


TypeError: 'numpy.float64' object cannot be interpreted as an integer

In [None]:
# --- Plot Training and Validation Accuracy/Loss ---
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(len(acc))

plt.figure(figsize=(12, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()


In [None]:
# --- Evaluate the Model on the Validation Set ---
# Evaluate the model
val_loss, val_acc = vgg_model.evaluate(validation_generator)
print(f'Validation accuracy: {val_acc}')

# Get predictions for the validation set
Y_pred = vgg_model.predict(validation_generator)
y_pred = np.argmax(Y_pred, axis=1)

# Get true labels
y_true = validation_generator.classes

# Confusion Matrix
print('Confusion Matrix')
cm = confusion_matrix(y_true, y_pred)
print(cm)

# Classification Report
print('Classification Report')
target_names = list(train_generator.class_indices.keys())
print(classification_report(y_true, y_pred, target_names=target_names))


### Fine-Tuning Strategy

My fine-tuning strategy involves a two-step process. Initially, I freeze the pre-trained convolutional base of the VGG-16 model and only train the newly added, randomly initialized classification layers. This allows the new layers to learn the specific features of the corn leaf disease dataset without disrupting the learned representations in the convolutional base. Once the new layers have converged, I will unfreeze some of the top layers of the convolutional base and continue training the entire network with a very low learning rate. This second step allows the model to "fine-tune" the pre-trained features to the specific dataset, potentially leading to a further increase in performance. This approach prevents large, random weight updates from destroying the pre-trained weights in the early stages of training and allows for a more gradual and effective adaptation of the model to the new task.
