In [1]:
import os

PROJECT = !gcloud config list --format 'value(core.project)'
PROJECT = PROJECT[0]
BUCKET = PROJECT+"-capstone"
REGION = "us-central1"

os.environ["BUCKET"] = BUCKET
os.environ["REGION"] = REGION

In [2]:
# get train 

from google.cloud import storage
from collections import defaultdict
import os
import re
import random

# Initialize the storage client
storage_client = storage.Client()

# Set bucket name from environment variable
bucket_name = os.environ["BUCKET"]
bucket = storage_client.bucket(bucket_name)

image_folder = "train"

# List all image files in the specified folder
blobs = bucket.list_blobs(prefix=image_folder)

image_urls = []
labels = []
images = []

# Function to extract label from the blob name
def extract_label(blob_name):
    # Example regex to extract label: 'train/category/image.jpg'
    match = re.search(r'train/([^/]+)/.*', blob_name)
    return match.group(1) if match else 'unknown'

# Dictionary to keep track of image counts per label
label_counts = defaultdict(int)

# Dictionary to set a random limit for each label
# label_limits = defaultdict(lambda: random.randint(500, 700))

# Collect image URLs and their labels, limit to a random number between 600 and 700 per label
for blob in blobs:
    if blob.name.lower().endswith(('.png', '.jpg', '.jpeg')) and blob.name.lower().startswith('train/tomato'):
        label = extract_label(blob.name)
        # if label_counts[label] < label_limits[label]:
        image_urls.append(f"gs://{bucket_name}/{blob.name}")
        labels.append(label)
        label_counts[label] += 1

print(f"Found {len(image_urls)} images.")

# Print the count of images for each label
for label, count in label_counts.items():
    print(f"Label: {label}, Number of Images: {count}")

Found 18345 images.
Label: Tomato___Bacterial_spot, Number of Images: 1702
Label: Tomato___Early_blight, Number of Images: 1920
Label: Tomato___Late_blight, Number of Images: 1851
Label: Tomato___Leaf_Mold, Number of Images: 1882
Label: Tomato___Septoria_leaf_spot, Number of Images: 1745
Label: Tomato___Spider_mites Two-spotted_spider_mite, Number of Images: 1741
Label: Tomato___Target_Spot, Number of Images: 1827
Label: Tomato___Tomato_Yellow_Leaf_Curl_Virus, Number of Images: 1961
Label: Tomato___Tomato_mosaic_virus, Number of Images: 1790
Label: Tomato___healthy, Number of Images: 1926


In [3]:
# get validation 

from google.cloud import storage
from collections import defaultdict
import os
import re
import random

# Initialize the storage client
storage_client = storage.Client()

# Set bucket name from environment variable
bucket_name = os.environ["BUCKET"]
bucket = storage_client.bucket(bucket_name)

image_folder = "valid"

# List all image files in the specified folder
blobs = bucket.list_blobs(prefix=image_folder)

val_image_urls = []
val_labels = []
val_images = []

# Function to extract label from the blob name
def extract_label(blob_name):
    # Example regex to extract label: 'train/category/image.jpg'
    match = re.search(r'valid/([^/]+)/.*', blob_name)
    return match.group(1) if match else 'unknown'

# Dictionary to keep track of image counts per label
label_counts = defaultdict(int)

# Dictionary to set a random limit for each label
# label_limits = defaultdict(lambda: random.randint(50, 90))

# Collect image URLs and their labels, limit to a random number between 600 and 700 per label
for blob in blobs:
    if blob.name.lower().endswith(('.png', '.jpg', '.jpeg')) and blob.name.lower().startswith('valid/tomato'):
        label = extract_label(blob.name)
        # if label_counts[label] < label_limits[label]:
        val_image_urls.append(f"gs://{bucket_name}/{blob.name}")
        val_labels.append(label)
        label_counts[label] += 1

print(f"Found {len(val_image_urls)} images.")

# Print the count of images for each label
for label, count in label_counts.items():
    print(f"Label: {label}, Number of Images: {count}")


Found 4585 images.
Label: Tomato___Bacterial_spot, Number of Images: 425
Label: Tomato___Early_blight, Number of Images: 480
Label: Tomato___Late_blight, Number of Images: 463
Label: Tomato___Leaf_Mold, Number of Images: 470
Label: Tomato___Septoria_leaf_spot, Number of Images: 436
Label: Tomato___Spider_mites Two-spotted_spider_mite, Number of Images: 435
Label: Tomato___Target_Spot, Number of Images: 457
Label: Tomato___Tomato_Yellow_Leaf_Curl_Virus, Number of Images: 490
Label: Tomato___Tomato_mosaic_virus, Number of Images: 448
Label: Tomato___healthy, Number of Images: 481


In [4]:
import numpy as np

CLASS_NAMES = np.array(
    ["Tomato___Bacterial_spot", 
    "Tomato___Early_blight", 
    "Tomato___Late_blight",
    "Tomato___Leaf_Mold",
    "Tomato___Septoria_leaf_spot",
    "Tomato___Spider_mites Two-spotted_spider_mite",
    "Tomato___Target_Spot",
    "Tomato___Tomato_Yellow_Leaf_Curl_Virus",
    "Tomato___Tomato_mosaic_virus",
    "Tomato___healthy"]
    )

print(f"These are {len(CLASS_NAMES)} available classes:", CLASS_NAMES)

These are 10 available classes: ['Tomato___Bacterial_spot' 'Tomato___Early_blight' 'Tomato___Late_blight'
 'Tomato___Leaf_Mold' 'Tomato___Septoria_leaf_spot'
 'Tomato___Spider_mites Two-spotted_spider_mite' 'Tomato___Target_Spot'
 'Tomato___Tomato_Yellow_Leaf_Curl_Virus' 'Tomato___Tomato_mosaic_virus'
 'Tomato___healthy']


In [5]:
import numpy as np

CLASS_NAMES = np.array(
    ["Tomato___Bacterial_spot", 
"Tomato___Early_blight", 
"Tomato___Late_blight",
"Tomato___Leaf_Mold",
"Tomato___Septoria_leaf_spot",
"Tomato___Spider_mites Two-spotted_spider_mite",
"Tomato___Target_Spot",
"Tomato___Tomato_Yellow_Leaf_Curl_Virus",
"Tomato___Tomato_mosaic_virus",
"Tomato___healthy"]
)

print(f"These are {len(CLASS_NAMES)} available classes:", CLASS_NAMES)

These are 10 available classes: ['Tomato___Bacterial_spot' 'Tomato___Early_blight' 'Tomato___Late_blight'
 'Tomato___Leaf_Mold' 'Tomato___Septoria_leaf_spot'
 'Tomato___Spider_mites Two-spotted_spider_mite' 'Tomato___Target_Spot'
 'Tomato___Tomato_Yellow_Leaf_Curl_Virus' 'Tomato___Tomato_mosaic_virus'
 'Tomato___healthy']


In [6]:
import tensorflow as tf
SHUFFLE_BUFFER = 1000 
batch_size = 128
MAX_DELTA = 63.0 / 255.0  # Change brightness by at most 17.7%
CONTRAST_LOWER = 0.2
CONTRAST_UPPER = 1.8


# Function to load and preprocess an image
def read_and_preprocess(file_path, label_str, random_augment=False):
    if random_augment:
        image = tf.io.read_file(file_path)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.random_brightness(image, MAX_DELTA)
        image = tf.image.random_contrast(image, CONTRAST_LOWER, CONTRAST_UPPER)
        image = tf.image.resize(image, [224, 224])  # Adjust to your target size
        image = tf.cast(image, tf.float32) / 255.0  # Normalize to [0, 1]
    else: 
        image = tf.io.read_file(file_path)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, [224, 224])
        image = tf.cast(image, tf.float32) / 255.0
    label = tf.math.equal(CLASS_NAMES, label_str)
    return image, label

def read_and_preprocess_with_augment(file_path, label_str):
    return read_and_preprocess(file_path, label_str, random_augment=True)

# def load_dataset(img_urls, lbls, batch_size, training=True):
#     dataset = tf.data.Dataset.from_tensor_slices((img_urls, lbls)).cache()
#     if training:
#         dataset = dataset.shuffle(SHUFFLE_BUFFER)
#         dataset = dataset.map(read_and_preprocess_with_augment, num_parallel_calls=tf.data.experimental.AUTOTUNE)
#         dataset = dataset.repeat()
#     else:
#         dataset = dataset.map(read_and_preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
#         dataset = dataset.repeat(1)
        
#     dataset = dataset.batch(batch_size)
#     dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
#     return dataset

def load_dataset(img_urls, lbls, batch_size, training=True):
    dataset = tf.data.Dataset.from_tensor_slices((img_urls, lbls)).cache()
    if training:
        dataset = dataset.shuffle(len(image_urls))
        dataset = dataset.map(read_and_preprocess_with_augment, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        dataset = dataset.repeat()
    else:
        dataset = dataset.map(read_and_preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        dataset = dataset.repeat(1)
        
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset
    

# Load training and validation datasets
train_dataset = load_dataset(image_urls, labels, batch_size, training=True)
val_dataset = load_dataset(val_image_urls, val_labels, batch_size, training=False)



2024-06-27 04:31:56.141459: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-27 04:31:57.948190: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-27 04:31:57.950343: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-

In [9]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50, NASNetMobile, EfficientNetB0
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau



# Define the ResNet50 model
base_model = NASNetMobile(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(len(CLASS_NAMES), activation='softmax')(x)  # Adjust output layer to number of classes

model = Model(inputs=base_model.input, outputs=predictions)

# Freeze the layers of ResNet50 base model
for layer in base_model.layers:
    layer.trainable = False

# # Compile the model
# model.compile(optimizer=Adam(learning_rate=0.0001, weight_decay = 0.0001), 
#               loss='categorical_crossentropy', 
#               metrics=['accuracy'])

# # Train the model
# history = model.fit(train_dataset,
#                     epochs=10,  # Set the number of epochs as needed
#                     steps_per_epoch=len(image_urls) // batch_size,
#                     validation_data=val_dataset,
#                     validation_steps=len(val_image_urls) // batch_size)

# Unfreeze some layers and fine-tune the model
for layer in base_model.layers[-32:]:  # Unfreeze the last 10 layers
    layer.trainable = True

# Recompile the model with a lower learning rate for fine-tuning
model.compile(optimizer=Adam(learning_rate=0.0001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model_checkpoint = ModelCheckpoint(filepath='../model/NASNet_cp.hdf5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss',factor=0.7,patience=2,verbose=1,min_lr=0.0001)
# Continue training with fine-tuning
history_fine = model.fit(train_dataset,
                         epochs=20,  # Continue for more epochs if necessary
                         steps_per_epoch=len(image_urls) // batch_size,
                         validation_data=val_dataset,
                         validation_steps=len(val_image_urls) // batch_size,
                         callbacks = [early_stopping,reduce_lr, model_checkpoint])

Epoch 1/20

2024-06-27 04:37:29.126051: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [4585]
	 [[{{node Placeholder/_1}}]]
2024-06-27 04:37:29.126392: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [4585]
	 [[{{node Placeholder/_0}}]]



Epoch 1: val_loss improved from inf to 0.74286, saving model to ../model/NASNet_cp.hdf5
Epoch 2/20
Epoch 2: val_loss improved from 0.74286 to 0.60937, saving model to ../model/NASNet_cp.hdf5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.60937
Epoch 4/20
Epoch 4: val_loss did not improve from 0.60937
Epoch 5/20
Epoch 5: val_loss improved from 0.60937 to 0.55386, saving model to ../model/NASNet_cp.hdf5
Epoch 6/20
Epoch 6: val_loss improved from 0.55386 to 0.49796, saving model to ../model/NASNet_cp.hdf5
Epoch 7/20
Epoch 7: val_loss improved from 0.49796 to 0.48883, saving model to ../model/NASNet_cp.hdf5
Epoch 8/20
Epoch 8: val_loss did not improve from 0.48883
Epoch 9/20
Epoch 9: val_loss improved from 0.48883 to 0.38403, saving model to ../model/NASNet_cp.hdf5
Epoch 10/20
Epoch 10: val_loss improved from 0.38403 to 0.33020, saving model to ../model/NASNet_cp.hdf5
Epoch 11/20
Epoch 11: val_loss improved from 0.33020 to 0.31258, saving model to ../model/NASNet_cp.hdf5
Epoch 12/20


In [19]:
from keras.layers import Dense, Input, Activation, add, Add, Dropout, BatchNormalization, GlobalAveragePooling2D
from keras.models import Sequential, Model


model = Sequential()
model.add(model_ENB0)
model.add(GlobalAveragePooling2D())
model.add(Dropout(0.2))
model.add(Dense(len(CLASS_NAMES),activation="softmax"))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb0 (Functional)  (None, 7, 7, 1280)       4049571   
                                                                 
 global_average_pooling2d_7   (None, 1280)             0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dropout_2 (Dropout)         (None, 1280)              0         
                                                                 
 dense_11 (Dense)            (None, 10)                12810     
                                                                 
Total params: 4,062,381
Trainable params: 12,810
Non-trainable params: 4,049,571
_________________________________________________________________


In [22]:
from keras.layers import Dense, Input, Activation, add, Add, Dropout, BatchNormalization, GlobalAveragePooling2D
from keras.models import Sequential, Model

# Create Model
model_ENB0 = EfficientNetB0(include_top=False, weights='imagenet', input_shape=(224,224,3))
model_ENB0.trainable = False
model_ENB0.summary()
model = Sequential()
model.add(model_ENB0)
model.add(GlobalAveragePooling2D())
model.add(Dropout(0.2))
model.add(Dense(len(CLASS_NAMES),activation="softmax"))
model.summary()

for layer in model.layers[-32:]:  # Unfreeze the last 10 layers
    layer.trainable = True

model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model_checkpoint = ModelCheckpoint(filepath='../model/EfficientNetB0_cp.hdf5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss',factor=0.7,patience=2,verbose=1,min_lr=0.0001)

# Continue training with fine-tuning
history_fine = model.fit(train_dataset,
                         epochs=20,  # Continue for more epochs if necessary
                         steps_per_epoch=len(image_urls) // batch_size,
                         validation_data=val_dataset,
                         validation_steps=len(val_image_urls) // batch_size,
                         callbacks = [early_stopping,reduce_lr, model_checkpoint])

Epoch 1/20


2024-06-27 06:37:07.998206: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape insequential_2/efficientnetb0/block2b_drop/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 1: val_loss improved from inf to 2.31585, saving model to ../model/EfficientNetB0_cp.hdf5


TypeError: Unable to serialize [2.0896919 2.1128857 2.1081853] to JSON. Unrecognized type <class 'tensorflow.python.framework.ops.EagerTensor'>.

In [25]:
model_ENB0 = EfficientNetB0(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
model_ENB0.trainable = False
# model_ENB0.summary()

model = Sequential()
model.add(model_ENB0)
model.add(GlobalAveragePooling2D())
model.add(Dropout(0.2))
model.add(Dense(len(CLASS_NAMES), activation="softmax"))
# model.summary()

for layer in model.layers[-32:]:  # Unfreeze the last 10 layers
    layer.trainable = True
    
# Compile Model
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

# Callbacks
model_checkpoint = ModelCheckpoint(filepath='../model/EfficientNetB0_cp.hdf5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=2, verbose=1, min_lr=0.0001)

# Continue training with fine-tuning
history_fine = model.fit(train_dataset,
                         epochs=20,
                         steps_per_epoch=len(image_urls) // batch_size,
                         validation_data=val_dataset,
                         validation_steps=len(val_image_urls) // batch_size,
                         callbacks=[early_stopping, reduce_lr, model_checkpoint])

Epoch 1/20


2024-06-27 06:46:57.958475: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape insequential_5/efficientnetb0/block2b_drop/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 1: val_loss improved from inf to 2.33419, saving model to ../model/EfficientNetB0_cp.hdf5


TypeError: Unable to serialize [2.0896919 2.1128857 2.1081853] to JSON. Unrecognized type <class 'tensorflow.python.framework.ops.EagerTensor'>.