In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# CIFAR

**Project Overview**

Goal:

Build a Custom CNN (no transfer learning) that can classify CIFAR-100 images into 100 classes with ~65% accuracy.

Dataset:

	•	CIFAR-100: 50k train + 10k test, each image is 32×32 RGB, 100 classes.
	•	Example classes: apple, chair, shark, pickup truck, leopard, etc.

Approach:
	
    1.	Load dataset & prepare data pipeline
	2.	Create augmentations
	3.	Build CNN with residual blocks, BatchNorm, Dropout
	4.	Train with smart learning rate schedule
	5.	Evaluate performance
**

In [14]:
# imports

import tensorflow as tf
import numpy as np
import math

In [15]:
from tensorflow.keras import layers, models, regularizers

What’s happening:

	•	tensorflow → our deep learning framework
	•	layers, models, regularizers → Keras tools for building CNNs
    
	•	numpy → for array math
	•	math → for cosine learning rate scheduling

# PART 3 — Hyperparameter

Concept:

Think of these like dials on a control board:

	•	BATCH_SIZE: number of images per step
	•	EPOCHS: how many passes over the training data
	•	MIXUP_ALPHA: controls MixUp augmentation (image blending)
	•	LABEL_SMOOTHING: prevents overconfidence in predictions
	•	WEIGHT_DECAY: small penalty on weights to avoid                   overfitting
    
	•	LR (Learning Rate): how fast the model learns
	•	MOMENTUM: helps SGD optimizer “remember” direction
	•	WARMUP: slowly increase LR early to stabilize training

In [16]:
BATCH_SIZE = 128
IMG_SHAPE = (32, 32, 3)
NUM_CLASSES = 100
EPOCHS = 200

MIXUP_ALPHA = 0.2
LABEL_SMOOTHING = 0.1
WEIGHT_DECAY = 1e-4

INITIAL_LR = 0.1
MOMENTUM = 0.9
WARMUP_EPOCHS = 5

# PART 4 — Data Loading

Concept:

	•	TensorFlow gives you CIFAR-100 ready to go.
	•	Each x = images, y = class labels.
	•	label_mode='fine' → 100 detailed classes (not grouped into 20 superclasses).
	•	tf.squeeze removes unnecessary shape dimensions

In [17]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar100.load_data(label_mode = 'fine')
y_train = tf.squeeze(y_train)
y_test = tf.squeeze(y_test)

# PART 5 — Data Preprocessing (Train vs Test)

Concept:

	•	Normalization: divide by 255 to get pixels between [0,1]
	•	Augmentation (train only): random crop, flip, brightness — makes model more robust
	•	No augmentation on test data, only normalize

In [18]:
 # Training Prep
def preprocess_train(image, label):
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.image.resize_with_crop_or_pad(image, 36, 36)
    image = tf.image.random_crop(image, [32, 32, 3])
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, 0.1)
    image = tf.image.random_contrast(image, 0.9, 1.1)
    return image, label

In [19]:
# Eval Prep
def preprocess_eval(image, label):
    image = tf.cast(image, tf.float32) / 255.0
    return image, label

# PART 6 — Build the TensorFlow Dataset Pipeline

Concept:

	•	tf.data.Dataset efficiently feeds data to GPU.
	•	shuffle prevents the model from memorizing the order.
	•	map applies your preprocessing function.
	•	prefetch overlaps data loading with GPU work for speed

In [20]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(50000)
train_ds = train_ds.map(preprocess_train, num_parallel_calls = tf.data.AUTOTUNE)
train_ds = train_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.map(preprocess_eval).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# PART 7 — CNN Building Blocks

Concept:

	•	Conv2D → extracts features (edges, textures, etc.)
	•	BatchNorm → stabilizes and speeds up learning
	•	ReLU → adds non-linearity

In [21]:
def conv_bn_relu(x, filters, kernel_size=3, stride=1):
    x = layers.Conv2D(filters, kernel_size, strides=stride, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    return x

# PART 8 — Residual Block

Concept:

Residual blocks = ResNet magic

They skip connections, letting gradients flow easily through deep networks.

You can think: “learn the change (residual) instead of starting from scratch each layer"

In [22]:
def residual_block(x, filters, stride=1):
    shortcut = x
    out = conv_bn_relu(x, filters, 3, stride)
    out = layers.Conv2D(filters, 3, padding='same', use_bias=False)(out)
    out = layers.BatchNormalization()(out)

    # adjust shortcut size if needed
    if stride != 1 or x.shape[-1] != filters:
        shortcut = layers.Conv2D(filters, 1, strides=stride, use_bias=False)(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    x = layers.add([out, shortcut])
    x = layers.ReLU()(x)
    return x

# PART 10 — Full CNN Architecture

Concept:

	•	Deep stacked residual layers → feature extraction
	•	GlobalAveragePooling + GlobalMaxPooling → captures both mean and extreme features
	•	Dense layer → final classifier
	•	Softmax → converts to probabilities

In [23]:
def build_custom_cnn(input_shape=(32,32,3), num_classes=100):
    inputs = layers.Input(shape=input_shape)
    x = conv_bn_relu(inputs, 64)
    x = residual_block(x, 64)
    x = residual_block(x, 128, stride=2)
    x = residual_block(x, 256, stride=2)
    x = residual_block(x, 512, stride=2)
    x = se_block(x)  # optional

    # Advanced pooling: average + max
    gap = layers.GlobalAveragePooling2D()(x)
    gmp = layers.GlobalMaxPooling2D()(x)
    x = layers.Concatenate()([gap, gmp])

    x = layers.Dropout(0.3)(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    return models.Model(inputs, outputs)

# PART 11 — Learning Rate Schedule (Cosine + Warmup

Concept
:
	•	Starts small (warmup) → prevents exploding gradients early
	•	Slowly decays following a cosine wave — helps convergence
	•	Think of it as “start slow, go fast, then cool down smoothl

In [24]:
import tensorflow as tf
import math

class WarmUpCosine(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, base_lr, epochs, steps_per_epoch, warmup_epochs=5, name=None):
        super().__init__()
        self.base_lr = tf.convert_to_tensor(base_lr, dtype=tf.float32)
        self.epochs = int(epochs)
        self.steps_per_epoch = int(steps_per_epoch)
        self.warmup_steps = tf.cast(warmup_epochs * self.steps_per_epoch, tf.float32)
        self.total_steps = tf.cast(self.epochs * self.steps_per_epoch, tf.float32)
        self.name = name or "WarmUpCosine"

    def __call__(self, step):
        # make sure step is float32 tensor
        step = tf.cast(step, tf.float32)

        # Warmup LR: linear ramp from 0 -> base_lr over warmup_steps
        # Avoid division by zero if warmup_steps == 0
        warmup_steps = tf.maximum(self.warmup_steps, 1.0)
        warmup_lr = self.base_lr * (step / warmup_steps)

        # Cosine decay part (after warmup)
        progress = (step - self.warmup_steps) / tf.maximum(1.0, (self.total_steps - self.warmup_steps))
        # clip progress to [0,1]
        progress = tf.clip_by_value(progress, 0.0, 1.0)
        cosine_decay = 0.5 * (1.0 + tf.cos(math.pi * progress))
        cosine_lr = self.base_lr * cosine_decay

        # If step < warmup_steps -> warmup_lr else cosine_lr
        lr = tf.where(step < self.warmup_steps, warmup_lr, cosine_lr)
        # if step > total_steps, keep lr at 0 (optional) — here we keep clipped cosine_lr (already clipped)
        return lr

    def get_config(self):
        return {
            "base_lr": float(self.base_lr.numpy()) if tf.executing_eagerly() else float(self.base_lr),
            "epochs": self.epochs,
            "steps_per_epoch": self.steps_per_epoch,
            "warmup_epochs": int(self.warmup_steps.numpy() // self.steps_per_epoch) if tf.executing_eagerly() else int(self.warmup_steps / self.steps_per_epoch),
            "name": self.name
        }

In [26]:
# steps_per_epoch = math.ceil(len(x_train) / BATCH_SIZE)
# lr_schedule = WarmUpCosine(INITIAL_LR, epochs=EPOCHS, steps_per_epoch=steps_per_epoch, warmup_epochs=WARMUP_EPOCHS)

# optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=MOMENTUM, nesterov=True)

# # If you use categorical one-hot labels:
# # loss_fn = tf.keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTHING)
# # otherwise for integer labels:
# # loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# PART 12 — Compile the Mode

Concept
:
	•	SGD + Momentum → stable, proven optimizer for CNNs
	•	Label smoothing → prevents model from becoming overconfident
	•	Compile → tie the model, loss, and optimizer together before trainin

In [27]:
from tensorflow.keras.datasets import cifar100

(x_train, y_train), (x_test, y_test) = cifar100.load_data(label_mode='fine')

y_train = y_train.squeeze()
y_test = y_test.squeeze()

In [28]:
steps_per_epoch = len(x_train) // BATCH_SIZE

In [29]:
def se_block(x, se_ratio=8):
    filters = x.shape[-1]
    se = tf.keras.layers.GlobalAveragePooling2D()(x)
    se = tf.keras.layers.Dense(filters // se_ratio, activation='relu')(se)
    se = tf.keras.layers.Dense(filters, activation='sigmoid')(se)
    se = tf.keras.layers.Reshape((1,1,filters))(se)
    return tf.keras.layers.multiply([x, se])

In [30]:
steps_per_epoch = len(x_train) // BATCH_SIZE
lr_schedule = WarmUpCosine(INITIAL_LR, EPOCHS, steps_per_epoch)

optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=MOMENTUM, nesterov=True)
loss_fn = tf.keras.losses.CategoricalCrossentropy(label_smoothing=LABEL_SMOOTHING)

model = build_custom_cnn()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# PART 13 — Callback

Concept:
	•	Checkpoint: saves your best weights
	•	ReduceLROnPlateau: lowers LR if loss stops improving
	•	EarlyStopping: stops training early to prevent overfittin

In [31]:
callbacks = [
    tf.keras.callbacks.ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_accuracy'),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5),
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True)
]

# Part 14: Train Model

Concept:
* Each epoch = full pass over train data
* Validation used To Track Gen
* Callback Help Tune Automatically While Training

In [32]:
AUTOTUNE = tf.data.AUTOTUNE

In [33]:
# Use original integer labels y_train, y_test (shape (N,)) — do NOT one-hot encode
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_ds = train_ds.map(lambda x,y: (tf.cast(x, tf.float32)/255.0, y), num_parallel_calls=AUTOTUNE)
train_ds = train_ds.map(preprocess_train, num_parallel_calls=AUTOTUNE)   # your augment fn must return (image, int_label)
train_ds = train_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_ds = test_ds.map(lambda x,y: (tf.cast(x, tf.float32)/255.0, y), num_parallel_calls=AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Compile with sparse loss (no label smoothing available on older TF)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['sparse_categorical_accuracy'])
# or metrics=['accuracy'] — Keras will infer correct accuracy for sparse targets

In [34]:
# 1) Create optimizer with a float LR
optimizer = tf.keras.optimizers.SGD(learning_rate=INITIAL_LR, momentum=MOMENTUM, nesterov=True)

# 2) Define an epoch-based LR function (returns lr for given epoch)
def epoch_warmup_cosine(epoch):
    # simple epoch-based schedule: linear warmup -> cosine decay across EPOCHS
    warmup = WARMUP_EPOCHS
    if epoch < warmup:
        return float(INITIAL_LR * (epoch + 1) / warmup)  # +1 so epoch0 != 0
    # cosine decay after warmup
    progress = (epoch - warmup) / max(1, (EPOCHS - warmup))
    cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
    return float(INITIAL_LR * cosine)

# 3) Use LearningRateScheduler (Keras will set optimizer.lr)
lr_callback = tf.keras.callbacks.LearningRateScheduler(epoch_warmup_cosine, verbose=1)

# 4) Use ReduceLROnPlateau as well (it will set lr when plateau occurs)
reduce_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=6, verbose=1, mode='min'
)

callbacks = [
    tf.keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True, mode='max', verbose=1),
    lr_callback,
    reduce_on_plateau
]

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [37]:
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=EPOCHS,
    callbacks=callbacks
)


Epoch 1: LearningRateScheduler setting learning rate to 0.02.
Epoch 1/200
[1m390/391[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 61ms/step - accuracy: 0.0079 - loss: 4.6074
Epoch 1: val_accuracy did not improve from 0.01300
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 65ms/step - accuracy: 0.0079 - loss: 4.6074 - val_accuracy: 0.0127 - val_loss: 4.6540 - learning_rate: 0.0200

Epoch 2: LearningRateScheduler setting learning rate to 0.04.
Epoch 2/200
[1m390/391[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 62ms/step - accuracy: 0.0084 - loss: 4.6067
Epoch 2: val_accuracy did not improve from 0.01300
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 66ms/step - accuracy: 0.0084 - loss: 4.6067 - val_accuracy: 0.0119 - val_loss: 4.6283 - learning_rate: 0.0400

Epoch 3: LearningRateScheduler setting learning rate to 0.06000000000000001.
Epoch 3/200
[1m390/391[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 61ms/step - accura



[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 65ms/step - accuracy: 0.0089 - loss: 4.6081 - val_accuracy: 0.0139 - val_loss: 4.6399 - learning_rate: 0.0999

Epoch 11: LearningRateScheduler setting learning rate to 0.09983786540671051.
Epoch 11/200
[1m390/391[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 60ms/step - accuracy: 0.0085 - loss: 4.6080
Epoch 11: val_accuracy did not improve from 0.01390
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 64ms/step - accuracy: 0.0085 - loss: 4.6080 - val_accuracy: 0.0088 - val_loss: 5.2331 - learning_rate: 0.0998

Epoch 12: LearningRateScheduler setting learning rate to 0.09976658173588243.
Epoch 12/200
[1m390/391[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 60ms/step - accuracy: 0.0083 - loss: 4.6080
Epoch 12: val_accuracy did not improve from 0.01390
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 64ms/step - accuracy: 0.0083 - loss: 4.6080 - val_accuracy: 0.0081 - va

# PART 15 — Evaluate and Visualize

Concept:

	•	Load best model checkpoint
	•	Evaluate on unseen test set
	•	Expect accuracy to gradually climb toward 60–65% with good tunin

In [36]:
model.load_weights('best_model.h5')
test_loss, test_acc = model.evaluate(test_ds)
print("Test Accuracy:", test_acc)

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.0134 - loss: 4.7673
Test Accuracy: 0.013000000268220901
