# MODELLING AND EVALUATION NOTEBOOK

## Objectives
The client is interested in predicting if a cherry leaf is healthy or contains powdery mildew.

## INPUTS

- project5_cherryleaves/jupyter_notebooks/inputs/train
- project5_cherryleaves/jupyter_notebooks/inputs/test
- project5_cherryleaves/jupyter_notebooks/inputs/validation
- Image shape embeddings

## OUTPUTS

- Images distribution plot in train, validation, and test set
- Image augmentation
- Class indices to change prediction inference in labels
- Machine learning model creation and training
- Save model
- Learning curve plot for model performance
- Model evaluation on pickle file
- Prediction on a random image file

## Additional Comments | Insights | Conclusions

## Set Working directory

In [None]:
import os

# Capture the directory the kernel is currently running in and echo it,
# so it is clear which location relative paths will resolve against.
cwd = os.getcwd()
print("Current working directory:", cwd)


In [None]:
# Keep the current working directory under a second name; the bare
# expression on the last line makes the notebook display its value.
work_dir = os.getcwd()
work_dir

## Set input directories

In [None]:
# Root folder of the image data; each split lives in its own sub-folder.
my_data_dir = '/workspace/project5_cherryleaves/jupyter_notebooks/inputs'
train_path = f'{my_data_dir}/train'
val_path = f'{my_data_dir}/validation'
test_path = f'{my_data_dir}/test'

# Echo the resolved locations so a wrong root is spotted immediately.
print("Train directory:", train_path)
print("Validation directory:", val_path)
print("Test directory:", test_path)

## Number of images in train, test and validation data

## Importing necessary libraries and modules

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Define the directory containing the data (same root as the input
# directories cell; it holds the train/validation/test sub-folders).
my_data_dir = '/workspace/project5_cherryleaves/jupyter_notebooks/inputs'

# Define the labels; each name is also used as a sub-folder name when the
# images are counted per set.
labels = ['healthy', 'powdery_mildew']




## Creating an empty DataFrame to store label frequencies

In [None]:
# Start from an empty frame; the counting cell fills it with one row per
# (set, label) pair.
df_freq = pd.DataFrame([])

## Counting frequencies of labels in different sets

In [None]:
# Count the images stored for every (set, label) pair.
#
# The rows are gathered in a plain list and converted to a DataFrame once at
# the end: DataFrame.append was removed in pandas 2.0 and, where it still
# exists, copies the whole frame on every call.
freq_rows = []
for folder in ['train', 'validation', 'test']:
    for label in labels:
        # Each label has its own sub-folder; its entry count is the image count.
        num_images = len(os.listdir(os.path.join(my_data_dir, folder, label)))

        freq_rows.append({
            'Set': folder,
            'Label': label,
            'Frequency': num_images
        })

        # Print the frequency for each label in each set
        print(f"* {folder} - {label}: {num_images} images")

# Rebuilding the frame from scratch keeps the cell safe to re-run: the counts
# are replaced rather than appended a second time.
df_freq = pd.DataFrame(freq_rows, columns=['Set', 'Label', 'Frequency'])


## Plotting the label distribution

In [None]:
# Grouped bar chart: one group per data set, one bar per label.
sns.set_style("whitegrid")
plt.figure(figsize=(8, 5))
sns.barplot(x='Set', y='Frequency', hue='Label', data=df_freq)


## Saving the plot and defining the directory where the plot should be saved

In [None]:
save_dir = '/workspace/project5_cherryleaves/jupyter_notebooks/outputs/modelling'

# Create the 'modelling' folder if it doesn't exist; exist_ok removes the
# need for a separate existence check.
os.makedirs(save_dir, exist_ok=True)

# Jupyter's inline backend closes a figure at the end of the cell that drew
# it, so calling savefig here on its own would write a blank image. Redraw the
# chart in this cell so the figure being saved is the one holding the bars.
sns.set_style("whitegrid")
plt.figure(figsize=(8, 5))
sns.barplot(data=df_freq, x='Set', y='Frequency', hue='Label')

# Save the plot before show(), which releases the figure.
plt.savefig(f'{save_dir}/labels_distribution.png', bbox_inches='tight', dpi=150)
plt.show()

# Image Data Augmentation

## Importing necessary libraries and modules

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import joblib
import os


## Define batch size and image augmentation parameters

In [None]:
# Images are fed to the network in batches of 20 at 256x256 RGB.
batch_size = 20
image_shape = (256, 256, 3)

# Random transformations applied to training images, plus rescaling of the
# pixel values by 1/255.
augmentation_settings = {
    'rotation_range': 20,
    'width_shift_range': 0.10,
    'height_shift_range': 0.10,
    'shear_range': 0.1,
    'zoom_range': 0.1,
    'horizontal_flip': True,
    'vertical_flip': True,
    'fill_mode': 'nearest',
    'rescale': 1./255,
}
augmented_image_data = ImageDataGenerator(**augmentation_settings)


## Define image generators for validation and test sets without augmentation

In [None]:
# Validation and test images are only rescaled, never augmented.
validation_image_data = ImageDataGenerator(rescale=1./255)
test_image_data = ImageDataGenerator(rescale=1./255)

# Both evaluation sets are read with identical settings; shuffle is off so
# the batches come back in a fixed order.
evaluation_flow_options = dict(
    target_size=image_shape[:2],
    color_mode='rgb',
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False,
)

validation_set = validation_image_data.flow_from_directory(
    val_path, **evaluation_flow_options
)

test_set = test_image_data.flow_from_directory(
    test_path, **evaluation_flow_options
)


## Generate augmented training set

In [None]:
# Training-time generator: rescale pixels by 1/255 and apply random
# rotations, shifts, shear, zoom and flips, filling exposed pixels with the
# nearest value.
augmented_image_data = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    shear_range=0.1,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    vertical_flip=True,
    horizontal_flip=True,
    fill_mode='nearest',
)


In [None]:
# Locate the training images under the shared data root.
my_data_dir = '/workspace/project5_cherryleaves/jupyter_notebooks/inputs'
train_path = os.path.join(my_data_dir, 'train')

# Stream shuffled, augmented batches with binary (single-value) labels.
train_flow_options = dict(
    target_size=image_shape[:2],
    color_mode='rgb',
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True,
)
train_set = augmented_image_data.flow_from_directory(train_path, **train_flow_options)


## Print class indices

In [None]:
# Report the class-name -> index mapping each generator derived from its
# folder, so the three splits can be checked against each other.
for split_name, split_generator in (
    ("Train", train_set),
    ("Validation", validation_set),
    ("Test", test_set),
):
    print(f"{split_name} Set Class Indices:", split_generator.class_indices)


## Visualize sample images

In [None]:
def _label_to_index(raw_label):
    """Return the integer class index for a scalar or one-hot batch label."""
    try:
        # Scalar labels (e.g. class_mode='binary') convert directly.
        return int(raw_label)
    except TypeError:
        # Vector labels (e.g. one-hot rows): the largest entry marks the class.
        values = list(raw_label)
        return values.index(max(values))


def visualize_sample_images(data_generator, num_samples=3):
    """Show up to `num_samples` images per class, titled with their true label.

    The class of every image is taken from the labels the generator yields
    and mapped back to a name through `data_generator.class_indices`, so the
    title always matches the picture. (Previously the title came from a fixed
    list of names that was unrelated to the image actually drawn.)

    Args:
        data_generator: Iterator yielding `(images, labels)` batches and
            exposing `class_indices` and `len()` (number of batches).
        num_samples: Maximum number of images to display for each class.

    Note:
        Batches are consumed from `data_generator`, so its position advances.
    """
    index_to_label = {
        index: name for name, index in data_generator.class_indices.items()
    }
    samples = {name: [] for name in index_to_label.values()}

    # Read at most one full pass of batches; an unshuffled generator may only
    # reach the second class after many batches of the first.
    for _ in range(len(data_generator)):
        if all(len(found) >= num_samples for found in samples.values()):
            break
        # next(generator) works across Keras versions; the .next() method is
        # not available on newer iterators.
        images, batch_labels = next(data_generator)
        for img, raw_label in zip(images, batch_labels):
            found = samples[index_to_label[_label_to_index(raw_label)]]
            if len(found) < num_samples:
                found.append(img)

    for label, images in samples.items():
        print(f"Sample images for class '{label}':")
        for img in images:
            print("Image Shape:", img.shape)
            plt.imshow(img)
            plt.title(label)
            plt.show()

# Visualize sample images for training, validation, and test sets
visualize_sample_images(train_set)
visualize_sample_images(validation_set)
visualize_sample_images(test_set)


In [None]:
save_dir = '/workspace/project5_cherryleaves/jupyter_notebooks/outputs/image_augmentation'
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# replacing the separate check-then-create step.
os.makedirs(save_dir, exist_ok=True)

# Persist the class-name -> index mapping of the training generator so that
# predictions can be translated back into label names later.
joblib.dump(value=train_set.class_indices, filename=os.path.join(save_dir, 'class_indices.pkl'))


# Model Creation

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Input size expected by the model (height, width, channels) and the number
# of images per batch; same values as in the augmentation section.
image_shape = (256, 256, 3)  
batch_size = 20  


In [None]:
def create_tf_model():
    """Build and compile the convolutional classifier.

    The network stacks three Conv2D + MaxPooling2D stages (32, 64 and 64
    filters), flattens the result, and ends with Dense(128), Dropout(0.5) and
    a two-unit softmax output. It is compiled with categorical cross-entropy,
    the Adam optimizer and the accuracy metric, so it expects one label
    vector per image (as produced with class_mode='categorical').

    The input size is read from the module-level `image_shape`.

    Returns:
        The compiled Sequential model.
    """
    print("Creating TensorFlow model...")
    model = Sequential()
    # Only the first layer declares the input shape; later layers infer
    # theirs from the layer before them, so repeating it there is redundant.
    model.add(Conv2D(filters=32, kernel_size=(3, 3), input_shape=image_shape, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))  # Two classes: powdery mildew and healthy
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print("Model created successfully.")
    return model


In [None]:
# Build a throwaway model purely to print its layer table.
print("Model Summary:")
summary_model = create_tf_model()
summary_model.summary()


In [None]:
# Callback that watches the validation loss with a patience of 3 epochs;
# it is handed to model.fit in the training cell.
early_stop = EarlyStopping(monitor='val_loss', patience=3)


In [None]:
print("\nLoading data generators...")

# Training images: rescaled and randomly augmented.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    shear_range=0.1,
    zoom_range=0.1,
    width_shift_range=0.10,
    height_shift_range=0.10,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
)

# Settings shared by both splits; 'categorical' yields one label vector per
# image, matching the two-unit softmax output of the model.
categorical_flow_options = dict(
    target_size=image_shape[:2],
    color_mode='rgb',
    batch_size=batch_size,
    class_mode='categorical',
)

train_set = train_datagen.flow_from_directory(
    train_path, shuffle=True, **categorical_flow_options
)

# Validation images: rescaled only, read in a fixed order.
validation_datagen = ImageDataGenerator(rescale=1./255)
validation_set = validation_datagen.flow_from_directory(
    val_path, shuffle=False, **categorical_flow_options
)
print("Data generators loaded successfully.")


In [None]:
print("\nTraining the model...")
model = create_tf_model()

# Number of whole batches that fit into the training set.
training_steps = len(train_set.classes) // batch_size

# Train for up to 25 epochs, validating after each one; the early-stopping
# callback may end the run sooner.
history = model.fit(
    train_set,
    validation_data=validation_set,
    steps_per_epoch=training_steps,
    epochs=25,
    callbacks=[early_stop],
    verbose=1,
)
print("Model training completed.")

In [None]:
print("\nSaving the model...")
# The target folder is relative to the working directory and is not created
# anywhere else in the notebook, so make sure it exists before writing.
os.makedirs('outputs/v1', exist_ok=True)
model.save('outputs/v1/cherry_leaves_model.keras')
print("Model saved successfully.")


In [None]:
import pandas as pd

print("\nPlotting model training losses...")

# One row per epoch, one column per metric recorded during training.
losses = pd.DataFrame(history.history)

sns.set_style("whitegrid")
loss_curves = losses[['loss', 'val_loss']]
loss_curves.plot(style='.-')
plt.title("Loss")

# Write the image before show() so the saved figure still has its content.
plt.savefig('outputs/v1/model_training_losses.png', bbox_inches='tight', dpi=150)
plt.show()


In [None]:
print("\nPlotting model training accuracy...")

# Training vs. validation accuracy per epoch, taken from the history frame.
accuracy_curves = losses[['accuracy', 'val_accuracy']]
accuracy_curves.plot(style='.-')
plt.title("Accuracy")

# Write the image before show() so the saved figure still has its content.
plt.savefig('outputs/v1/model_training_acc.png', bbox_inches='tight', dpi=150)
plt.show()


In [None]:
print("\nLoading a saved model for prediction...")
# Load the exact file written by the save cell: the model is stored as
# 'cherry_leaves_model.keras', so asking for a '.h5' file would not find it.
model = tf.keras.models.load_model('outputs/v1/cherry_leaves_model.keras')
print("Model loaded successfully.")



In [None]:
from tensorflow.keras.preprocessing import image

# Class names in index order.
# NOTE(review): assumes this order matches the generators' class_indices
# (printed earlier in the notebook) -- confirm if the folders change.
labels = ["healthy", "powdery_mildew"]

pointer = 66 
label = labels[0]  # Assuming "healthy" is the first class in your labels list

# os.listdir returns entries in arbitrary order; sorting makes `pointer`
# select the same file on every run and machine.
label_dir = os.path.join(test_path, label)
image_files = sorted(os.listdir(label_dir))

pil_image = image.load_img(
    os.path.join(label_dir, image_files[pointer]),
    # load_img takes (height, width); the channel count is not part of it.
    target_size=image_shape[:2],
    color_mode='rgb'
)
print(f'Image shape: {pil_image.size}, Image mode: {pil_image.mode}')

# A bare `pil_image` expression is only rendered when it is the last line of
# a cell, so draw the picture explicitly instead.
plt.imshow(pil_image)
plt.axis('off')
plt.show()

# Convert image to array and prepare for prediction
print("\nConverting image to array and preparing for prediction...")
my_image = image.img_to_array(pil_image)
# Add the batch axis and apply the same 1/255 scaling used by the generators.
my_image = np.expand_dims(my_image, axis=0) / 255
print("Image converted and prepared for prediction successfully.")

# Predict class probabilities
print("\nPredicting class probabilities...")
pred_proba = model.predict(my_image)[0]  
# The most probable class is the position of the largest output value.
predicted_class_index = np.argmax(pred_proba)
predicted_class = labels[predicted_class_index]
pred_probability = pred_proba[predicted_class_index]

print(f"Predicted Probability: {pred_probability}")
print(f"Predicted Class: {predicted_class}")


In [None]:
print("\nConverting image to array and preparing for prediction...")
# Turn the PIL image into an array, add a leading batch axis and scale the
# pixel values by 1/255.
image_array = image.img_to_array(pil_image)
my_image = np.expand_dims(image_array, axis=0) / 255
print("Image converted and prepared for prediction successfully.")


In [None]:
print("\nPredicting class probabilities...")
# The model returns one row of class probabilities per input image; only a
# single image was passed in, so keep the first row.
batch_predictions = model.predict(my_image)
pred_proba = batch_predictions[0]

# Pick the most probable class and look up its name and probability.
predicted_class_index = np.argmax(pred_proba)
pred_probability = pred_proba[predicted_class_index]
predicted_class = labels[predicted_class_index]

print(f"Predicted Probability: {pred_probability}")
print(f"Predicted Class: {predicted_class}")


In [None]:
# Shell escape: show the state of the working tree before committing.
!git status

In [None]:
# Shell escape: stage every change under the current directory.
!git add .

In [None]:
!git commit -m ""

In [None]:
# Shell escape: push the local commits to the configured remote.
!git push