<a href="https://colab.research.google.com/github/groove-net/Motive/blob/main/KitchenCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from datasets import load_dataset
dataset = load_dataset("kbanstola/kitchen-utensil-image-dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/787 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/165M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1000
    })
})


In [3]:
IMAGE_MEAN = [0.5, 0.5, 0.5]
IMAGE_STD  = [0.5, 0.5, 0.5]

In [4]:
from torchvision import transforms

# --- Base config ---
IMAGE_SIZE = 224
RESIZE_SIZE = 256

# --- Define transforms ---
train_transform = transforms.Compose([
    transforms.Resize(RESIZE_SIZE),
    transforms.CenterCrop(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(), # Random horizontal flip
    transforms.RandomRotation(10), # Randomly rotates the image by an angle between –10° and +10°
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3), # Randomly changes the brightness, contrast, and saturation of an image during training +/- 30%.
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGE_MEAN, std=IMAGE_STD),
])

val_transform = transforms.Compose([
    transforms.Resize(RESIZE_SIZE),
    transforms.CenterCrop(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGE_MEAN, std=IMAGE_STD),
])

In [5]:
# --- Helper function for Hugging Face Datasets ---
def make_transform(transform):
    """Wrap a torchvision transform for use with Hugging Face Datasets."""
    def apply_transform(batch):
        batch["pixel_values"] = [transform(img.convert("RGB")) for img in batch["image"]]
        return batch
    return apply_transform

# --- Apply transforms ---
train_ds = dataset["train"].with_transform(make_transform(train_transform))
val_ds   = dataset["validation"].with_transform(make_transform(val_transform))
test_ds   = dataset["test"].with_transform(make_transform(val_transform))

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class KitchenCNN(nn.Module):
    def __init__(self, num_classes):
        super(KitchenCNN, self).__init__()

        # --- Convolutional blocks ---
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)

        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)

        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)

        self.pool = nn.MaxPool2d(2, 2)

        # --- Adaptive pooling to fix flatten size ---
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))  # output: [batch, 128, 1, 1]

        # --- Fully connected layers ---
        self.fc1 = nn.Linear(128, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))

        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # flatten all except batch
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Example usage:
model = KitchenCNN(num_classes=10)

In [7]:
import torch
from torch.utils.data import DataLoader

def collate_fn(batch):
    pixel_values = torch.stack([x["pixel_values"] for x in batch])
    labels = torch.tensor([x["label"] for x in batch])
    return {"pixel_values": pixel_values, "label": labels}

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, num_workers=2, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=32, num_workers=2, collate_fn=collate_fn)

In [8]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

KitchenCNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=10, bias=True)
)

In [9]:
import torch
from torch import nn
criterion = nn.CrossEntropyLoss()

In [10]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

In [11]:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=1)

In [12]:
batch = next(iter(train_loader))
images = batch["pixel_values"].to(device)
labels = batch["label"].to(device)
print(images.min(), images.max())

for i in range(30):
    optimizer.zero_grad()
    outputs = model(images)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f"Step {i+1}: loss={loss.item():.4f}")

tensor(-1.) tensor(1.)
Step 1: loss=2.3232
Step 2: loss=2.1605
Step 3: loss=2.0609
Step 4: loss=2.0321
Step 5: loss=1.9400
Step 6: loss=1.9278
Step 7: loss=1.8790
Step 8: loss=1.7821
Step 9: loss=1.7732
Step 10: loss=1.7017
Step 11: loss=1.6572
Step 12: loss=1.6370
Step 13: loss=1.5728
Step 14: loss=1.5526
Step 15: loss=1.5439
Step 16: loss=1.4346
Step 17: loss=1.4383
Step 18: loss=1.3827
Step 19: loss=1.3940
Step 20: loss=1.2926
Step 21: loss=1.3338
Step 22: loss=1.2840
Step 23: loss=1.2749
Step 24: loss=1.1029
Step 25: loss=1.1838
Step 26: loss=1.1817
Step 27: loss=1.0684
Step 28: loss=1.0164
Step 29: loss=1.0121
Step 30: loss=0.9310


In [13]:
from tqdm import tqdm
from torch.cuda.amp import autocast

def train_one_epoch(model, dataloader, optimizer, criterion, device, scaler=None, scheduler=None, grad_clip=None, epoch=0):
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    pbar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Training Epoch {epoch+1}", leave=False)
    for batch_idx, (_, batch) in enumerate(pbar):
        images = batch["pixel_values"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad(set_to_none=True)
        with autocast(enabled=(scaler is not None)):
            outputs = model(images)
            loss = criterion(outputs, labels)

        if scaler:
            scaler.scale(loss).backward()
            if grad_clip:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            if grad_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

        # <-- Move this inside the batch loop for WarmRestarts -->
        if scheduler and isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step(epoch + batch_idx / len(dataloader))

        running_loss += loss.item() * images.size(0)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    if scheduler and not isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingWarmRestarts):
        scheduler.step()

    train_loss = running_loss / total
    train_acc = 100.0 * correct / total
    return train_loss, train_acc

In [14]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

def validate(model, dataloader, criterion, device):
    model.eval()
    val_loss, all_preds, all_labels = 0.0, [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating", leave=False):
            images = batch["pixel_values"].to(device)
            labels = batch["label"].to(device)

            outputs = model(images)
            logits = outputs.logits if hasattr(outputs, "logits") else outputs
            loss = criterion(logits, labels)
            val_loss += loss.item() * images.size(0)

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss /= len(dataloader.dataset)
    acc = accuracy_score(all_labels, all_preds) * 100
    precision = precision_score(all_labels, all_preds, average="macro", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="macro", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)

    return val_loss, acc, precision, recall, f1

In [15]:
import numpy as np
from torch.cuda.amp import GradScaler

# ===============================================================
# 🚀 Main Training Loop with Early Stopping
# ===============================================================
def fit(
    model, train_loader, val_loader, criterion, optimizer, scheduler=None,
    device="cuda", num_epochs=20, grad_clip=1.0, early_stop_patience=5,
    use_amp=True
):
    scaler = GradScaler() if (use_amp and device.type == "cuda") else None

    best_val_loss = np.inf
    patience_counter = 0

    for epoch in range(num_epochs):
        print(f"\n🧭 Epoch [{epoch+1}/{num_epochs}]")

        train_loss, train_acc = train_one_epoch(
            model, train_loader, optimizer, criterion,
            device, scaler=scaler, scheduler=scheduler, grad_clip=grad_clip
        )

        val_loss, val_acc, val_prec, val_rec, val_f1 = validate(
            model, val_loader, criterion, device
        )

        print(
            f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
            f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | "
            f"Precision: {val_prec:.2f} | Recall: {val_rec:.2f} | F1: {val_f1:.2f}"
        )

        # Early Stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_model.pt")
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                print("⏹️ Early stopping triggered.")
                break

    print("\n✅ Training complete. Best model saved as 'best_model.pt'.")

fit(model, train_loader, val_loader, criterion, optimizer, scheduler, device, 20)


🧭 Epoch [1/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 2.2000 | Train Acc: 19.10% | Val Loss: 2.1613 | Val Acc: 18.90% | Precision: 0.27 | Recall: 0.19 | F1: 0.16

🧭 Epoch [2/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 2.1253 | Train Acc: 21.94% | Val Loss: 2.0731 | Val Acc: 24.60% | Precision: 0.24 | Recall: 0.25 | F1: 0.20

🧭 Epoch [3/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 2.0823 | Train Acc: 24.54% | Val Loss: 1.9949 | Val Acc: 28.70% | Precision: 0.27 | Recall: 0.29 | F1: 0.25

🧭 Epoch [4/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 2.0426 | Train Acc: 26.36% | Val Loss: 1.9390 | Val Acc: 31.20% | Precision: 0.33 | Recall: 0.31 | F1: 0.27

🧭 Epoch [5/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 2.0072 | Train Acc: 28.29% | Val Loss: 1.9087 | Val Acc: 32.50% | Precision: 0.31 | Recall: 0.33 | F1: 0.30

🧭 Epoch [6/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.9915 | Train Acc: 29.55% | Val Loss: 1.8781 | Val Acc: 33.20% | Precision: 0.33 | Recall: 0.33 | F1: 0.30

🧭 Epoch [7/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.9667 | Train Acc: 30.01% | Val Loss: 1.8636 | Val Acc: 33.60% | Precision: 0.35 | Recall: 0.34 | F1: 0.31

🧭 Epoch [8/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.9244 | Train Acc: 32.19% | Val Loss: 1.8810 | Val Acc: 33.30% | Precision: 0.35 | Recall: 0.33 | F1: 0.29

🧭 Epoch [9/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.9037 | Train Acc: 33.17% | Val Loss: 1.7882 | Val Acc: 35.70% | Precision: 0.36 | Recall: 0.36 | F1: 0.33

🧭 Epoch [10/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.8832 | Train Acc: 34.91% | Val Loss: 1.7902 | Val Acc: 36.60% | Precision: 0.35 | Recall: 0.37 | F1: 0.34

🧭 Epoch [11/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.8751 | Train Acc: 34.64% | Val Loss: 1.7905 | Val Acc: 37.80% | Precision: 0.40 | Recall: 0.38 | F1: 0.37

🧭 Epoch [12/20]


  with autocast(enabled=(scaler is not None)):


Train Loss: 1.8431 | Train Acc: 35.76% | Val Loss: 1.7003 | Val Acc: 42.10% | Precision: 0.39 | Recall: 0.42 | F1: 0.40

🧭 Epoch [13/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.8322 | Train Acc: 36.14% | Val Loss: 1.7810 | Val Acc: 39.00% | Precision: 0.43 | Recall: 0.39 | F1: 0.37

🧭 Epoch [14/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.8273 | Train Acc: 36.06% | Val Loss: 1.7542 | Val Acc: 39.30% | Precision: 0.39 | Recall: 0.39 | F1: 0.38

🧭 Epoch [15/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.7955 | Train Acc: 37.01% | Val Loss: 1.7232 | Val Acc: 39.10% | Precision: 0.39 | Recall: 0.39 | F1: 0.37

🧭 Epoch [16/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.7872 | Train Acc: 38.59% | Val Loss: 1.6487 | Val Acc: 43.40% | Precision: 0.40 | Recall: 0.43 | F1: 0.41

🧭 Epoch [17/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.7765 | Train Acc: 38.52% | Val Loss: 1.7032 | Val Acc: 39.90% | Precision: 0.43 | Recall: 0.40 | F1: 0.38

🧭 Epoch [18/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.7611 | Train Acc: 38.94% | Val Loss: 1.6490 | Val Acc: 43.10% | Precision: 0.45 | Recall: 0.43 | F1: 0.42

🧭 Epoch [19/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.7427 | Train Acc: 39.60% | Val Loss: 1.6611 | Val Acc: 41.80% | Precision: 0.42 | Recall: 0.42 | F1: 0.41

🧭 Epoch [20/20]


  with autocast(enabled=(scaler is not None)):
                                                           

Train Loss: 1.7363 | Train Acc: 39.30% | Val Loss: 1.6096 | Val Acc: 44.60% | Precision: 0.44 | Recall: 0.45 | F1: 0.43

✅ Training complete. Best model saved as 'best_model.pt'.




In [16]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    test_loss = 0.0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Testing"):
            inputs = batch["pixel_values"].to(device)
            labels = batch["label"].to(device)

            outputs = model(inputs)
            logits = outputs.logits if hasattr(outputs, "logits") else outputs

            loss = criterion(logits, labels)
            test_loss += loss.item() * inputs.size(0)

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = test_loss / len(dataloader.dataset)

    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")

    print(f"\n📊 Final Test Results:")
    print(f"Loss: {avg_loss:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    return all_labels, all_preds

In [17]:
test_ds = dataset["test"].with_transform(lambda e: {
    "pixel_values": [val_transform(x.convert("RGB")) for x in e["image"]],
    "label": e["label"]
})

test_loader = DataLoader(test_ds, batch_size=32)

labels, preds = evaluate(model, test_loader, criterion, device)

Testing: 100%|██████████| 32/32 [01:31<00:00,  2.86s/it]


📊 Final Test Results:
Loss: 1.6831
Accuracy: 0.4260
Precision: 0.4322
Recall: 0.4260
F1-score: 0.4125



