In [1]:
# Cell 1 - Imports and Setup
import tensorflow as tf
import numpy as np
import os
import gc
from pathlib import Path
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.mixed_precision import set_global_policy


# Memory management
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled")
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")

# Enable mixed precision
set_global_policy('mixed_float16')

2024-12-28 22:51:34.493493: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-28 22:51:34.524411: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735401094.585077   14800 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735401094.592639   14800 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-28 22:51:34.668929: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

GPU memory growth enabled


In [2]:
# Cell 2 - Configuration
IMG_SIZE = (600, 600)  # Increased for B7
BATCH_SIZE = 4  # Reduced for B7
EPOCHS = 20
DATA_DIR = "/mnt/c/Users/Aufa_Mulyana/003/VisKom/Cassava"
MODEL_PATH = "/mnt/c/Users/Aufa_Mulyana/003/VisKom/Models/cassava_model_b7.keras"

In [3]:
# Cell 3 - Data Generators
def create_generators():
    # Enhanced data augmentation for training
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        shear_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        vertical_flip=True,
        brightness_range=[0.8, 1.2],
        fill_mode='nearest',
        validation_split=0.2
    )
    
    # Only rescaling for validation
    valid_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=0.2
    )
    
    # Training generator
    train_generator = train_datagen.flow_from_directory(
        DATA_DIR,
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='sparse',
        subset='training',
        shuffle=True
    )
    
    # Validation generator
    valid_generator = valid_datagen.flow_from_directory(
        DATA_DIR,
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='sparse',
        subset='validation',
        shuffle=False
    )
    
    return train_generator, valid_generator


In [4]:
 #Cell 4 - Model Creation
def create_model(num_classes):
    # Clear memory
    gc.collect()
    tf.keras.backend.clear_session()
    
    # Create base model
    base_model = EfficientNetB7(
        weights='imagenet',
        include_top=False,
        input_shape=(*IMG_SIZE, 3)
    )
    
    # Unfreeze some layers
    for layer in base_model.layers[-30:]:
        layer.trainable = True
    
    # Create model
    inputs = tf.keras.Input(shape=(*IMG_SIZE, 3))
    x = base_model(inputs)
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.5)(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    
    model = tf.keras.Model(inputs, outputs)
    
    # Compile
    optimizer = Adam(learning_rate=1e-4)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [5]:
# Cell 5 - Training Setup
def create_callbacks():
    return [
        EarlyStopping(
            monitor='val_accuracy',
            patience=12,
            restore_best_weights=True,
            verbose=1
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=6,
            min_lr=1e-7,
            verbose=1
        ),
        ModelCheckpoint(
            MODEL_PATH,
            monitor='val_accuracy',
            save_best_only=True,
            mode='max',
            verbose=1
        )
    ]

In [6]:
# Cell 6 - Training Execution
def resume_training():
    # Create generators
    train_generator, valid_generator = create_generators()
    
    # Load the saved model
    print("Loading saved model from:", "/home/mondhirsch/ml/cassava_model_b7.keras")
    model = tf.keras.models.load_model("/home/mondhirsch/ml/cassava_model_b7.keras")
    
    # Create callbacks (same as before)
    callbacks = create_callbacks()
    
    # Continue training
    history = model.fit(
        train_generator,
        validation_data=valid_generator,
        epochs=EPOCHS,  # Will train for additional epochs from where it left off
        callbacks=callbacks
    )
    
    return model, history


In [7]:
# Cell 7 - Resume Training
model, history = resume_training()

Found 14449 images belonging to 5 classes.
Found 3610 images belonging to 5 classes.
Loading saved model from: /home/mondhirsch/ml/cassava_model_b7.keras


I0000 00:00:1735401103.463990   14800 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1735401103.467371   14800 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5520 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))
  self._warn_if_super_not_called()


Epoch 1/20


I0000 00:00:1735401247.785080   14942 service.cc:148] XLA service 0x7f0048008cd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735401247.787241   14942 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2024-12-28 22:54:14.324498: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1735401287.920154   14942 cuda_dnn.cc:529] Loaded cuDNN version 90300
2024-12-28 22:56:06.919485: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng3{k11=0} for conv (f32[4,288,150,150]{3,2,1,0}, u8[0]{0}) custom-call(f32[4,48,150,150]{3,2,1,0}, f32[288,48,1,1]{3,2,1,0}), window={size=1x1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leaky

[1m3415/3613[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m2:38[0m 800ms/step - accuracy: 0.8845 - loss: 0.3382 

E0000 00:00:1735404283.807375   14942 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1735404283.959435   14942 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1735404284.087541   14942 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2024-12-28 23:44:45.355214: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng3{k11=0} for conv (f32[1,48,150,150]{3,2,1,0}, u8[0]{0}) custom-call(f32[1,288,150,150]{3,2,1,0}, f32[48,288,1,1]{3,2,1,0}), window={size=1x1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result

[1m3613/3613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 838ms/step - accuracy: 0.8843 - loss: 0.3388  

  self._warn_if_super_not_called()




E0000 00:00:1735404902.287525   14946 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1735404902.445712   14946 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1735404902.564853   14946 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2024-12-28 23:55:04.322404: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng3{k11=0} for conv (f32[2,48,150,150]{3,2,1,0}, u8[0]{0}) custom-call(f32[2,288,150,150]{3,2,1,0}, f32[48,288,1,1]{3,2,1,0}), window={size=1x1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convForward", backend_config={"cudnn_conv_backend_config":


Epoch 1: val_accuracy improved from -inf to 0.85263, saving model to /mnt/c/Users/Aufa_Mulyana/003/VisKom/Models/cassava_model_b7.keras
[1m3613/3613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3850s[0m 955ms/step - accuracy: 0.8843 - loss: 0.3388 - val_accuracy: 0.8526 - val_loss: 0.4832 - learning_rate: 1.0000e-04
Epoch 2/20
[1m3613/3613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8903 - loss: 0.3250     
Epoch 2: val_accuracy did not improve from 0.85263
[1m3613/3613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4902s[0m 1s/step - accuracy: 0.8903 - loss: 0.3250 - val_accuracy: 0.2490 - val_loss: 2.6065 - learning_rate: 1.0000e-04
Epoch 3/20
[1m3613/3613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.8925 - loss: 0.3255     
Epoch 3: val_accuracy did not improve from 0.85263
[1m3613/3613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7079s[0m 2s/step - accuracy: 0.8925 - loss: 0.3255 - val_accuracy: 0.8036 - va

KeyboardInterrupt: 

In [8]:
# Cell 8 - Plot Results
import matplotlib.pyplot as plt

def plot_training_history(history):
    plt.figure(figsize=(12, 4))
    
    # Plot accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'])
    
    # Plot loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'])
    
    plt.tight_layout()
    plt.show()

plot_training_history(history)

NameError: name 'history' is not defined