## Model Training

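In this notebook we will define the loss function and optimizer, set the hyperparameters, implement the training loop, and add early stopping and checkpointing: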

### Define Loss Function & Optimizer
- **Loss Function:**  
  Use cross-entropy loss for multi-class classification.
- **Optimizer:**  
  Choose between:
  - Stochastic Gradient Descent (SGD)
  - Adam (justify choice based on experimentation or literature).

### Set Hyperparameters
- **Learning Rate:**  
  Start with 0.001 for Adam or 0.01 for SGD.
- **Batch Size:**  
  Typically 32 or 64.
- **Number of Epochs:**  
  Start with 50-100 epochs and monitor for early stopping.

### Implement Training Loop
- **Forward Pass:**  
  Compute predictions.
- **Backward Pass:**  
  Compute gradients and update weights.
- **Logging:**  
  Track training and validation loss/accuracy.

### Early Stopping & Checkpointing
- **Early Stopping:**  
  Monitor validation loss to halt training when performance plateaus.
- **Model Checkpointing:**  
  Save the best model based on validation performance.

## Loading dataset from TFRecord file

In [None]:
import tensorflow as tf

def _parse_raw_image(proto):
    """Decode the image stored under the 'image' key of one serialized example.

    Args:
        proto: Scalar string tensor holding one serialized TFRecord example.

    Returns:
        The decoded image tensor, forced to 3 channels.
    """
    # Only the encoded image bytes are requested; any other feature is ignored.
    image_spec = {'image': tf.io.FixedLenFeature([], tf.string)}
    example = tf.io.parse_single_example(proto, image_spec)

    # The records are assumed to be JPEG-encoded; swap the decoder if they are not.
    return tf.io.decode_jpeg(example['image'], channels=3)

# Load previously saved TFRecord dataset
tfrecord_path = "balanced_train_20250218_231144.tfrecord"

# TFRecordDataset is lazy and only opens the file when it is iterated, so check
# for the file up front to get a clear error right where the path is defined.
if not tf.io.gfile.exists(tfrecord_path):
    raise FileNotFoundError(f"TFRecord file not found: {tfrecord_path}")

raw_dataset = tf.data.TFRecordDataset(tfrecord_path)

def inspect_image_shapes(dataset, n=5):
    """
    Samples up to n images from a raw TFRecord dataset and checks their shapes.

    The dataset is parsed with `_parse_raw_image` inside this function, so pass
    the raw (serialized) dataset rather than an already-parsed one.

    Args:
        dataset (tf.data.Dataset): Raw TFRecord dataset of serialized examples.
        n (int): Number of images to check.

    Returns:
        tuple: ((height, width), num_channels) if all sampled images match.

    Raises:
        ValueError: If no image could be sampled, or if the sampled images
            do not all share one shape.
    """
    # Parse dataset before checking shapes
    parsed_dataset = dataset.map(_parse_raw_image)

    # shuffle(1000) samples from a 1000-element buffer, so the n images come
    # from roughly the first 1000 + n records rather than the whole file.
    sampled_shapes = (
        tuple(image.shape)  # Convert TensorShape to tuple
        for image in parsed_dataset.shuffle(1000).take(n)
    )
    # Keep only shapes that carry height, width, and channels.
    image_shapes = {shape for shape in sampled_shapes if len(shape) == 3}

    # An empty sample is a different problem from mismatched shapes; report it
    # separately instead of as "Inconsistent image shapes found: set()".
    if not image_shapes:
        raise ValueError(
            "No images with a (height, width, channels) shape were found. "
            "Check that the TFRecord is not empty and that n > 0."
        )

    if len(image_shapes) > 1:
        raise ValueError(f"Inconsistent image shapes found: {image_shapes}. Ensure uniform image sizes in TFRecord.")

    height, width, num_channels = next(iter(image_shapes))
    print(f"All images have the same shape: ({height}, {width}) with {num_channels} channels.")
    return (height, width), num_channels


# Derive the image geometry once from a 5-image sample of the raw records.
IMAGE_SIZE, NUM_CHANNELS = inspect_image_shapes(dataset=raw_dataset, n=5)
print(f"Image size: {IMAGE_SIZE}, Channels: {NUM_CHANNELS}")



In [None]:

# Function to parse a TFRecord 
def _parse_function(proto):
    """Turn one serialized example into a normalized (image, label) pair.

    Reads the module-level IMAGE_SIZE and NUM_CHANNELS values.

    Args:
        proto: Scalar string tensor containing a serialized example with an
            'image' (encoded bytes) feature and a 'label' (int64) feature.

    Returns:
        tuple: (image, label), where image has been resized to IMAGE_SIZE and
        divided by 255, and label is the int64 scalar read from the record.
    """
    schema = {
        'label': tf.io.FixedLenFeature([], tf.int64),
        'image': tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_single_example(proto, schema)

    pixels = tf.io.decode_jpeg(example['image'], channels=NUM_CHANNELS)
    # Resize to the uniform shape, then scale pixel values into [0, 1].
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return resized / 255.0, example['label']

# Parse dataset. Decoding and resizing run in parallel; element order is still
# preserved because tf.data maps are deterministic by default.
parsed_dataset = raw_dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)

# Inspect the first few labels (the image half of each pair is not needed here)
for _, label in parsed_dataset.take(5):
    print(label.numpy())  # Print label values


In [None]:
import tensorflow as tf
import glob

# Collect every path that follows the export naming pattern.
model_dirs = glob.glob("my_cnn_model_*")
model_dirs.sort()  # lexicographic order; the last entry is treated as the latest
if len(model_dirs) == 0:
    raise FileNotFoundError("No saved model directories found.")

latest_model_dir = model_dirs[-1]
print("Loading the latest model from:", latest_model_dir)

# Restore the model and bind it to both names (`loaded_model` and `model`).
model = loaded_model = tf.keras.models.load_model(latest_model_dir)
model.summary()

In [None]:
# Define Loss Function & Optimizer
from tensorflow.keras.optimizers import Adam

# Configure training: integer class labels -> sparse categorical cross-entropy,
# optimised with Adam, reporting accuracy. Extra metrics (precision, recall)
# can be appended to the list if needed.
model.compile(
    optimizer=Adam(learning_rate=0.0005),  # learning rate is tunable
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# model training

# Input pipeline: shuffle within a 1000-element buffer, then group into batches.
BATCH_SIZE = 32
train_dataset = parsed_dataset.shuffle(buffer_size=1000).batch(batch_size=BATCH_SIZE)

# Fit for 10 epochs over the batched dataset.
model.fit(x=train_dataset, epochs=10)

In [None]:
import tensorflow as tf

# Query the GPU list once and reuse it for both reporting and configuration.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpus))
# is_built_with_cuda() reports how TensorFlow was compiled, not whether a GPU
# is actually present, so it is labelled as a build property.
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("Available GPUs:", gpus)

# Restrict TensorFlow to the first GPU and enable on-demand memory growth.
# NOTE: both settings have to be applied before TensorFlow initialises its
# devices; if earlier cells have already run ops, these calls raise
# RuntimeError. Run this cell first after a kernel restart.
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.set_visible_devices(gpus[0], 'GPU')  # Use only the first GPU
        print("✅ GPU is now enabled for TensorFlow")
    except RuntimeError as e:
        print("Could not change GPU configuration (devices already initialised?):", e)
else:
    print("No GPU detected; TensorFlow will run on CPU.")

In [None]:
import tensorflow as tf

BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE  # Let tf.data tune the prefetch buffer at runtime

# Input pipeline: shuffle -> batch -> repeat -> prefetch.
# The former identity `.map(lambda x, y: (x, y))` stage was removed: it returned
# its inputs unchanged and only added per-batch overhead.
train_dataset = (
    parsed_dataset
    .shuffle(1000)
    .batch(BATCH_SIZE)
    .repeat()  # Ensures dataset doesn't run out during multiple epochs
    .prefetch(AUTOTUNE)  # Prepare upcoming batches while the current one trains
)

# The dataset repeats indefinitely, so `steps_per_epoch` defines how many
# batches make up one epoch.
model.fit(train_dataset, epochs=10, steps_per_epoch=100)


In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Hyperparameters
LEARNING_RATE = 0.001
BATCH_SIZE = 32
EPOCHS = 50
AUTOTUNE = tf.data.AUTOTUNE

# Count records on the raw dataset: parsed_dataset is a 1:1 map of it, and
# iterating the serialized records avoids decoding every JPEG just to count.
dataset_size = sum(1 for _ in raw_dataset)

# Split dataset (80% train, 20% validation)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size
if train_size == 0:
    raise ValueError(
        f"Dataset has {dataset_size} example(s); need at least 2 to build "
        "non-empty training and validation splits."
    )

# One epoch = one full pass over the training split (ceiling division so the
# final partial batch is included).
STEPS_PER_EPOCH = -(-train_size // BATCH_SIZE)

# NOTE(review): take/skip splits by record order. If the TFRecord was written
# class by class, the validation split will be skewed. Confirm the file was
# shuffled when it was written.

# Training dataset (repeats forever; epoch length is set by STEPS_PER_EPOCH)
train_dataset = (
    parsed_dataset.take(train_size)
    .shuffle(1000)
    .batch(BATCH_SIZE)
    .repeat()
    .prefetch(AUTOTUNE)
)

# Validation dataset (finite; consumed in full at the end of every epoch)
val_dataset = (
    parsed_dataset.skip(train_size)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Implement early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,  # raised from 5 to tolerate longer plateaus before stopping
    restore_best_weights=True
)

# Train the model. `validation_steps` is intentionally omitted so that the
# whole validation split is evaluated each epoch instead of a fixed 20 batches.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    callbacks=[early_stopping]
)


In [None]:
# EarlyStopping.stopped_epoch stays 0 when the callback never fired and is
# 0-indexed otherwise, so report the two cases separately and add 1 to match
# the epoch numbers shown in the Keras training log.
if early_stopping.stopped_epoch > 0:
    print(f"Training stopped at epoch: {early_stopping.stopped_epoch + 1}")
else:
    print("Early stopping was not triggered; training ran for all scheduled epochs.")
