<a href="https://colab.research.google.com/github/eyobedb/Multimodal-papaya-disease-classification-Leveraging-Computer-vision-and-NLP/blob/main/XAI_Multi_modal_papaya.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, applications
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import os

In [None]:
# -----------------------------
# CONFIGURATION
# -----------------------------
# Spatial size every image is resized to when loaded; also used as the spatial
# part of the image model's input shape.
IMAGE_SIZE = (128, 128)
# Number of samples per batch produced by the directory iterators.
BATCH_SIZE = 16
# Number of epochs requested from model.fit.
EPOCHS = 10
NUM_CLASSES = 4  # Black_spot, Powdery_mildew, Ring_spot, Healthy
# Fixed length every token sequence is padded/truncated to.
MAX_LEN = 50


In [None]:
from tensorflow.keras import mixed_precision
# Set the global dtype policy; it applies to every layer created after this call.
# NOTE(review): the TensorFlow mixed-precision guide recommends keeping the
# final softmax in float32 when this policy is active - check the output layer.
mixed_precision.set_global_policy('mixed_float16')

# -----------------------------
# 1. LOAD IMAGE DATA
# -----------------------------

In [None]:
# Dataset roots; each is expected to contain one sub-folder per class.
# NOTE(review): these look like Colab Google Drive paths; no Drive mount call
# is visible in this notebook - confirm Drive is mounted before running.
train_dir = "/content/drive/MyDrive/Dataset/Train"
test_dir = "/content/drive/MyDrive/Dataset/Test"

# Only rescaling is applied (no augmentation): pixel values are multiplied by 1/255.
datagen = ImageDataGenerator(rescale=1./255)

In [None]:
# Training iterator: one-hot labels, resized images, shuffled with a fixed seed.
train_image_gen = datagen.flow_from_directory(
    directory=train_dir,
    class_mode='categorical',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=True,
)



Found 5760 images belonging to 4 classes.


In [None]:
# Test iterator: shuffling is off so batches follow the on-disk file order.
test_image_gen = datagen.flow_from_directory(
    directory=test_dir,
    class_mode='categorical',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

Found 1600 images belonging to 4 classes.


# -----------------------------
# 2. CREATE TEXT DATA
# -----------------------------
# (Descriptions for each class)

In [None]:

# One fixed caption per class. Keys are looked up by the name of each image's
# parent folder, so they must match the class sub-folder names exactly.
text_descriptions = dict([
    ('Black_spot_papaya',
     'Leaves covered with black spots indicating black spot infection.'),
    ('Powdery_mildew',
     'Papaya leaves showing powdery mildew infection typical of powdery disease.'),
    ('Ring_spot_papaya',
     'Ring spot disease on papaya leaves indicating ring spot disease.'),
    ('Healthy_papaya',
     'Healthy Papaya leaf with no visible signs of disease or damage.'),
])

In [None]:
# Give every image the caption of its class folder, in the iterator's file order.
# NOTE(review): the caption is a pure function of the class folder, i.e. of the
# label itself, so the text input carries the ground-truth class. Confirm this
# is intended; it can make the reported accuracy optimistic.
def _caption_for(image_path):
    """Return the class caption for an image, keyed by its parent folder name."""
    class_folder = os.path.split(os.path.split(image_path)[0])[1]
    return text_descriptions[class_folder]


train_texts = list(map(_caption_for, train_image_gen.filepaths))
test_texts = list(map(_caption_for, test_image_gen.filepaths))


# -----------------------------
# 3. TEXT TOKENIZATION
# -----------------------------

In [None]:
# The vocabulary is built from the four class captions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(text_descriptions.values()))
# One extra slot on top of the learned word ids (pad_sequences fills with 0).
vocab_size = 1 + len(tokenizer.word_index)

# Captions -> integer id sequences.
train_seq, test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

# Pad/truncate at the end so every row has exactly MAX_LEN ids.
train_pad, test_pad = (
    pad_sequences(seq, maxlen=MAX_LEN, padding='post', truncating='post')
    for seq in (train_seq, test_seq)
)

# -----------------------------
# 4. BUILD IMAGE MODEL (MobileNetV2)
# -----------------------------

In [None]:
# Frozen ImageNet-pretrained MobileNetV2 backbone used as a feature extractor.
image_base = applications.MobileNetV2(weights='imagenet', include_top=False,
                                   input_shape=(*IMAGE_SIZE, 3))
image_base.trainable = False

image_model = models.Sequential([
    # The directory iterators deliver pixels in [0, 1] (rescale=1./255), but the
    # ImageNet MobileNetV2 weights expect inputs in [-1, 1]. Map x -> 2x - 1 so
    # the frozen backbone sees the value range it was trained on.
    layers.Rescaling(scale=2.0, offset=-1.0),
    image_base,
    layers.GlobalAveragePooling2D(),       # feature map -> one vector per image
    layers.Dense(256, activation='relu'),  # trainable projection head
    layers.Dropout(0.3)
])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


# -----------------------------
# 5. BUILD TEXT MODEL (Embedding + LSTM)
# -----------------------------

In [None]:
# Text branch: padded token ids -> embeddings -> LSTM summary -> dense features.
text_input = layers.Input(shape=(MAX_LEN,), name="text_input")
# `input_length` is intentionally not passed: the sequence length is already
# fixed by `text_input`, and Keras 3 deprecates the argument (it only warns).
y = layers.Embedding(vocab_size, 128)(text_input)
y = layers.LSTM(128)(y)
y = layers.Dense(128, activation='relu')(y)
text_model = models.Model(inputs=text_input, outputs=y)


# -----------------------------
# 6. COMBINE IMAGE + TEXT MODELS
# -----------------------------

In [None]:
# Image branch input; the name is what dict-style dataset elements key on.
image_input = layers.Input(shape=(*IMAGE_SIZE, 3), name="image_input")
img_features = image_model(image_input)

# Late fusion: concatenate both feature vectors and classify.
combined = layers.concatenate([img_features, text_model.output])
z = layers.Dense(128, activation='relu')(combined)
z = layers.Dropout(0.3)(z)
# The notebook enables the global 'mixed_float16' policy. Keep the final softmax
# in float32 so probabilities and the cross-entropy loss are numerically stable
# (recommended by the TensorFlow mixed-precision guide).
logits = layers.Dense(NUM_CLASSES)(z)
output = layers.Activation('softmax', dtype='float32')(logits)

model = models.Model(inputs=[image_input, text_model.input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# -----------------------------
# 7. CREATE CUSTOM DATASET
# -----------------------------

In [None]:
# -----------------------------
# MULTIMODAL GENERATOR (YIELDS ((image, text), label))
# -----------------------------
def multimodal_generator(image_gen, text_pad):
    """Yield ``((images, texts), labels)`` batches forever, texts matched per sample.

    Batches are fetched by index instead of ``next(image_gen)`` so that the
    sample indices behind every batch are known. This keeps each image paired
    with its own text row even when the iterator shuffles, and keeps the
    pairing correct across epoch boundaries and partial final batches.

    Args:
        image_gen: Keras directory iterator (as returned by
            ``flow_from_directory``). Must support ``len()``, integer batch
            indexing, and expose ``n``, ``batch_size``, ``index_array`` and
            ``on_epoch_end()``.
        text_pad: 2-D array of padded token ids; row ``k`` belongs to the
            ``k``-th file of ``image_gen`` (``image_gen.filepaths`` order).

    Yields:
        ``((imgs, texts), labels)`` where ``texts[j]`` is the text row of the
        same sample as ``imgs[j]``.

    Raises:
        ValueError: if ``text_pad`` does not have one row per image.
    """
    text_pad = np.asarray(text_pad)
    if len(text_pad) != image_gen.n:
        raise ValueError(
            f"text_pad has {len(text_pad)} rows but image_gen has {image_gen.n} samples"
        )

    num_batches = len(image_gen)
    if num_batches == 0:
        return  # nothing to yield; avoids spinning forever on an empty directory

    while True:
        for batch_idx in range(num_batches):
            imgs, labels = image_gen[batch_idx]
            # index_array holds the iterator's current sample order; it is
            # populated by the indexed access above and reshuffled on epoch end.
            start = batch_idx * image_gen.batch_size
            sample_idx = image_gen.index_array[start:start + imgs.shape[0]]
            yield (imgs, text_pad[sample_idx]), labels  # tuple, not list
        # Draw a new order (when shuffling is enabled) for the next pass.
        image_gen.on_epoch_end()


# -----------------------------
# WRAP IN TF.DATA.DATASET (TF 2.17 requirement)
# -----------------------------

In [None]:
# Element spec of one batch: ((images, texts), labels). The image shape is
# derived from IMAGE_SIZE so it cannot drift from the model's input shape.
output_signature = (
    (
        tf.TensorSpec(shape=(None, *IMAGE_SIZE, 3), dtype=tf.float32),  # images
        tf.TensorSpec(shape=(None, MAX_LEN), dtype=tf.int32)            # texts
    ),
    tf.TensorSpec(shape=(None, NUM_CLASSES), dtype=tf.float32)          # labels
)

# Each lambda builds a fresh Python generator whenever the dataset is iterated.
train_ds = tf.data.Dataset.from_generator(
    lambda: multimodal_generator(train_image_gen, train_pad),
    output_signature=output_signature
)
test_ds = tf.data.Dataset.from_generator(
    lambda: multimodal_generator(test_image_gen, test_pad),
    output_signature=output_signature
)

# -----------------------------
# 8. TRAIN MODEL
# -----------------------------

In [None]:
# The datasets are endless, so the number of batches per pass is given
# explicitly: one full sweep of each directory iterator.
steps_per_epoch, val_steps = len(train_image_gen), len(test_image_gen)

# NOTE(review): the test split doubles as validation data here.
history = model.fit(
    x=train_ds,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_ds,
    validation_steps=val_steps,
)

Epoch 1/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1238s[0m 7s/step - accuracy: 0.7880 - loss: 0.5462 - val_accuracy: 0.9663 - val_loss: 0.0923
Epoch 2/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 873ms/step - accuracy: 0.9779 - loss: 0.0725 - val_accuracy: 0.9781 - val_loss: 0.0607
Epoch 3/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 874ms/step - accuracy: 0.9901 - loss: 0.0283 - val_accuracy: 0.9837 - val_loss: 0.0573
Epoch 4/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 1s/step - accuracy: 0.9957 - loss: 0.0156 - val_accuracy: 0.9781 - val_loss: 0.0972
Epoch 5/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 878ms/step - accuracy: 0.9921 - loss: 0.0217 - val_accuracy: 0.9862 - val_loss: 0.0588
Epoch 6/10
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 872ms/step - accuracy: 0.9948 - loss: 0.0139 - val_accuracy: 0.9812 - val_loss: 0.0547
Epoch 7/1

# -----------------------------
# 9. SAVE MODEL
# -----------------------------

In [None]:
# Persist the trained model to Drive as a single HDF5 file.
saved_model_path = "/content/drive/MyDrive/Colab Notebooks/Cv_nlp/Preprocessed/multimodal_papaya_model_v2_tf217.h5"
model.save(saved_model_path)
print("✅ Model training complete and saved successfully!")



✅ Model training complete and saved successfully!


In [None]:
# Evaluation inputs: same test folder as in the training section, and the same
# file path the training section saved the model to.
test_dir = "/content/drive/MyDrive/Dataset/Test"
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/Cv_nlp/Preprocessed/multimodal_papaya_model_v2_tf217.h5"

# -----------------------------
# LOAD TRAINED MODEL
# -----------------------------

In [None]:
# Replace the in-memory model with the one restored from disk.
model = models.load_model(MODEL_PATH)
print("✅ Model loaded successfully.")



✅ Model loaded successfully.


# -----------------------------
# IMAGE DATA GENERATOR
# -----------------------------

In [None]:
# Same preprocessing as during training: pixel values scaled by 1/255 only.
datagen = ImageDataGenerator(rescale=1.0 / 255)
# Unshuffled iterator, so batch i always covers the same files in on-disk order.
test_image_gen = datagen.flow_from_directory(
    directory=test_dir,
    class_mode='categorical',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

Found 1600 images belonging to 4 classes.


In [None]:
# -----------------------------
# TEXT DESCRIPTIONS PER CLASS
# -----------------------------
# Same class -> caption mapping as in the training section, repeated so the
# evaluation cells do not depend on the earlier definition. The two copies must
# stay identical, otherwise the tokenizer refit below yields different ids.
text_descriptions = {
    'Black_spot_papaya': 'Leaves covered with black spots indicating black spot infection.',
    'Powdery_mildew': 'Papaya leaves showing powdery mildew infection typical of powdery disease.',
    'Ring_spot_papaya': 'Ring spot disease on papaya leaves indicating ring spot disease.',
    'Healthy_papaya': 'Healthy Papaya leaf with no visible signs of disease or damage.'
}


In [None]:

# Caption for every test image, looked up by the name of its parent folder.
test_texts = []
for image_path in test_image_gen.filepaths:
    class_folder = os.path.basename(os.path.dirname(image_path))
    test_texts.append(text_descriptions[class_folder])

# Refit a tokenizer on the same four captions used for training, then encode
# and pad the test captions to MAX_LEN ids each.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(text_descriptions.values()))
test_seq = tokenizer.texts_to_sequences(test_texts)
test_pad = pad_sequences(
    test_seq,
    padding='post',
    truncating='post',
    maxlen=MAX_LEN,
)

# -----------------------------
# TOKENIZE TEXTS (same logic as training)
# -----------------------------

In [None]:
# NOTE(review): these four statements repeat the end of the previous cell
# verbatim and recompute the same tokenizer, test_seq and test_pad.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(text_descriptions.values()))
test_seq = tokenizer.texts_to_sequences(test_texts)
test_pad = pad_sequences(test_seq, maxlen=MAX_LEN, padding='post', truncating='post')


# -----------------------------
# DEFINE TEST GENERATOR
# -----------------------------

In [None]:
def multimodal_generator():
    """Yield a single pass of ``(inputs, labels)`` batches over the test set.

    Reads the notebook-level ``test_image_gen``, ``test_pad`` and ``BATCH_SIZE``.
    ``inputs`` is a dict keyed by ``"image_input"`` and ``"text_input"``; the
    text rows are the slice of ``test_pad`` at the same positions as the images
    of batch ``k`` (``k * BATCH_SIZE`` onwards).

    Note: this zero-argument function reuses the name of the two-argument
    training generator defined earlier in the notebook and shadows it.
    """
    n_batches = len(test_image_gen)
    batch_no = 0
    while batch_no < n_batches:
        images, targets = test_image_gen[batch_no]
        offset = batch_no * BATCH_SIZE
        features = {
            "image_input": images,
            "text_input": test_pad[offset:offset + images.shape[0]],
        }
        yield (features, targets)
        batch_no += 1


# -----------------------------
# WRAP IN TF.DATA.DATASET
# -----------------------------

In [None]:
# Per-batch tensor specs; the leading None leaves the batch dimension free.
image_spec = tf.TensorSpec(shape=(None, *IMAGE_SIZE, 3), dtype=tf.float32)
text_spec = tf.TensorSpec(shape=(None, MAX_LEN), dtype=tf.int32)
label_spec = tf.TensorSpec(shape=(None, NUM_CLASSES), dtype=tf.float32)

# Element structure: (dict of named inputs, labels).
output_signature = (
    {"image_input": image_spec, "text_input": text_spec},
    label_spec,
)

# The generator function is re-invoked on every iteration of the dataset.
test_ds = tf.data.Dataset.from_generator(
    generator=multimodal_generator,
    output_signature=output_signature,
)


# -----------------------------
# EVALUATE MODEL
# -----------------------------

In [None]:
# One full pass over the test dataset; `results` holds the loss followed by
# the compiled metrics.
print("\n🚀 Evaluating model on test dataset...")
results = model.evaluate(x=test_ds, verbose=1)


🚀 Evaluating model on test dataset...
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 286ms/step - accuracy: 0.9634 - loss: 0.1860




In [None]:
# The first two entries of `results` are the loss and the accuracy.
loss, acc = results[:2]
for line in (f"\n✅ Test Accuracy: {acc*100:.2f}%", f"📉 Test Loss: {loss:.4f}"):
    print(line)


✅ Test Accuracy: 97.37%
📉 Test Loss: 0.1503



# -----------------------------
# GET PREDICTIONS (needed for the confusion matrix and classification report below)
# -----------------------------

In [None]:
# Class probabilities for every test image, reduced to the most likely class.
preds = model.predict(test_ds, verbose=1)
pred_classes = preds.argmax(axis=1)

# Class names and ground-truth class ids come from the test iterator.
# NOTE(review): comparing `true_classes` with `pred_classes` row by row relies
# on the iterator having been created with shuffle=False.
class_labels = [name for name in test_image_gen.class_indices]
true_classes = test_image_gen.classes

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 269ms/step


# Confusion Matrix

In [None]:
# Rows of `cm` are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=true_classes, y_pred=pred_classes)
# Per-class precision / recall / F1, labelled with the class folder names.
cr = classification_report(
    y_true=true_classes,
    y_pred=pred_classes,
    target_names=class_labels,
)

In [None]:
# Heading and payload are joined by a newline, one print call per section.
print("\n📊 Confusion Matrix:", cm, sep="\n")
print("\n🧾 Classification Report:", cr, sep="\n")


📊 Confusion Matrix:
[[377  14   4   5]
 [  0 397   0   3]
 [  0   6 388   6]
 [  0   0   4 396]]

🧾 Classification Report:
                   precision    recall  f1-score   support

Black_spot_papaya       1.00      0.94      0.97       400
   Healthy_papaya       0.95      0.99      0.97       400
   Powdery_mildew       0.98      0.97      0.97       400
 Ring_spot_papaya       0.97      0.99      0.98       400

         accuracy                           0.97      1600
        macro avg       0.97      0.97      0.97      1600
     weighted avg       0.97      0.97      0.97      1600

