In [5]:
# ====================================================
# End-to-End Pipeline Notebook
# ====================================================

# --- 0️⃣ Imports ---
import os
import sys

import torch

# `best_library` was not importable from this cell (it failed with
# "ModuleNotFoundError: No module named 'best_library'"). Put the notebook's
# parent directory on sys.path before the import -- the same path setup this
# notebook performs in a later cell. The membership check keeps re-runs of
# this cell from appending duplicate entries.
_PARENT_DIR = os.path.abspath("..")
if _PARENT_DIR not in sys.path:
    sys.path.append(_PARENT_DIR)

from best_library import (
    LoadData, Preprocessing, FeatureBuilder, DatasetSplitter,
    ModelBuilder, Trainer, Predictor, HyperparameterTuner, Evaluator
)

# --- 1️⃣ Configuration ---
import random  # stdlib; imported here only to seed Python's RNG below

DATASET_DIR = "../dataset"   # raw dataset
WORK_DIR = "../data"         # working directory for split dataset
BATCH_SIZE = 16              # batch size passed to the DataLoaders
LR = 1e-4                    # learning rate for the first (baseline) training run
EPOCHS = 5                   # number of epochs for the first training run
IMG_SIZE = 224               # image size handed to Preprocessing
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAVE_PATH = "../models/best_model.pth"  # where the tuned model is saved / reloaded
SEED = 42                    # single seed for every RNG seeded below

# Seed the random number generators up front so a "Restart & Run All" is
# repeatable.
# NOTE(review): which generators best_library actually draws from is not
# visible here; numpy seeding or cuDNN determinism flags may also be needed.
random.seed(SEED)
torch.manual_seed(SEED)

# ====================================================
# a) Preprocessing
# ====================================================
# Build the image transform for IMG_SIZE; it is handed to LoadData further down.
preprocessor = Preprocessing(img_size=IMG_SIZE)
transform = preprocessor.get_transform()

# ====================================================
# b) Feature building
# ====================================================
# The builder is only instantiated here; nothing else in this cell calls it.
feature_builder = FeatureBuilder()
# Example usage (needs the DataLoaders created at the end of this section):
# for images, labels in train_loader:
#     feats = feature_builder.extract_features(images[0])

# ====================================================
# c) Split dataset
# ====================================================
# Split the raw dataset in DATASET_DIR into the working directory WORK_DIR.
# NOTE(review): train_ratio=0.8 presumably means 80% train / 20% validation -- confirm.
splitter = DatasetSplitter(DATASET_DIR, WORK_DIR, train_ratio=0.8)
splitter.split()

# ====================================================
# 1️⃣ Load datasets with DataLoaders
# ====================================================
# Presumably has to run after the split above, since it loads from WORK_DIR.
# class_names is reused below to size the model and to build the Predictor.
loader = LoadData(WORK_DIR, transform=transform)
train_loader, val_loader, class_names = loader.load_and_split(batch_size=BATCH_SIZE)

# ====================================================
# d) Build and train the first model
# ====================================================
# The class count given to the builder is the number of entries in class_names.
num_classes = len(class_names)
model_builder = ModelBuilder(
    DEVICE,
    num_classes=num_classes,
)
model = model_builder.build()

# First training run, using the EPOCHS and LR values from the configuration.
trainer = Trainer(DEVICE)
trainer.train(
    model,
    train_loader,
    val_loader,
    epochs=EPOCHS,
    lr=LR,
)

# ====================================================
# e) Hyperparameter tuning
# ====================================================
import os  # stdlib; local import so this section does not rely on other cells

# Candidate values per hyperparameter; the keys match the keyword arguments
# passed to trainer.train above.
param_grid = {
    "lr": [1e-3, 1e-4],
    "epochs": [3, 5]
}

# Make sure the directory that will hold SAVE_PATH exists before tuning starts,
# so a long tuning run cannot fail at save time on a fresh checkout.
# NOTE(review): assumes the tuner does not create missing parent directories
# itself -- harmless if it does, because exist_ok=True makes this a no-op.
save_dir = os.path.dirname(SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

tuner = HyperparameterTuner(param_grid, DEVICE)
best_params, best_acc = tuner.tune(train_loader, val_loader, save_path=SAVE_PATH)
print(f"Best hyperparameters: {best_params}, Best validation accuracy: {best_acc:.3f}")

# ====================================================
# f) Evaluate model
# ====================================================
# Reload the model stored at SAVE_PATH and score it on the validation loader.
evaluator = Evaluator(DEVICE)
model_loader = Predictor(DEVICE, class_names)
best_model = model_loader.load_model(SAVE_PATH)
val_accuracy = evaluator.evaluate(best_model, val_loader)
print(f"Validation accuracy of the best model: {val_accuracy:.3f}")

# ====================================================
# g) Predict new images (optional)
# ====================================================
# predictor = Predictor(DEVICE, class_names)
# image_path = "../dataset/test/alpaca_01.jpg"
# label, confidence = predictor.predict(image_path, best_model, transform)
# print(f"Predicted label: {label}, Confidence: {confidence:.3f}")


ModuleNotFoundError: No module named 'best_library'

# Final Project CFDS - Pipeline & Tuning

This notebook demonstrates how to use the `best_library` package to run the end-to-end machine learning pipeline and perform hyperparameter tuning.

In [None]:
import sys
import os

# Add the parent directory to sys.path to access src.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to sys.path again.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
import torch
from best_library import LoadData, Preprocessing, FeatureBuilder, DatasetSplitter, ModelBuilder, Trainer, Predictor, HyperparameterTuner, Evaluator

## 1. Standard Pipeline Execution

Here we define our configuration, split the data, compute statistics, and train a baseline model.

In [None]:
# Configuration
# Note: every path below is resolved relative to this notebook's location.

# Data locations: WORK_DIR is what the loaders read from, DATASET_DIR is the
# source directory handed to the split step.
WORK_DIR = "../data"
DATASET_DIR = "../dataset"

# Image size used by Preprocessing, followed by the baseline training settings.
IMG_SIZE = 224
EPOCHS = 5
LR = 1e-4
BATCH_SIZE = 16

In [None]:
# --- 1️⃣ Load data ---
# NOTE(review): this cell runs before the split and preprocessing cells below,
# and train_loader / val_loader / class_names are reassigned in step 3 --
# confirm whether this early load is still needed.
loader = LoadData(WORK_DIR, transform=None)  # transform not applied yet
train_loader, val_loader, class_names = loader.load_and_split(batch_size=BATCH_SIZE)



In [None]:
# 1. Split Data
# `split_dataset` is not among the names this notebook imports from
# best_library, so calling it raises NameError. Use the imported
# DatasetSplitter class instead.
# NOTE(review): train_ratio=0.8 mirrors the other DatasetSplitter call in this
# notebook -- confirm it matches the split intended here.
splitter = DatasetSplitter(DATASET_DIR, WORK_DIR, train_ratio=0.8)
splitter.split()

In [None]:
# 2. Feature Engineering (Compute Stats)
# Statistics are computed only when a "train" directory exists under WORK_DIR;
# otherwise this cell silently does nothing.
# NOTE(review): `compute_dataset_stats` is not among the names imported from
# best_library in the visible cells, and its return value is discarded here --
# confirm where it is defined and whether its result should be kept.
train_dir = os.path.join(WORK_DIR, "train")
if os.path.exists(train_dir):
    compute_dataset_stats(train_dir, img_size=IMG_SIZE)

In [None]:
# 3. Preprocessing & Data Loading
# Build the image transform for IMG_SIZE.
preprocessing = Preprocessing(img_size=IMG_SIZE)
transform = preprocessing.get_transform()

# `load_data` is not among the names this notebook imports from best_library
# (NameError). Use the imported LoadData class, the same way the other cells
# of this notebook create their DataLoaders.
loader = LoadData(WORK_DIR, transform=transform)
train_loader, val_loader, class_names = loader.load_and_split(batch_size=BATCH_SIZE)

In [None]:
# 4. Build & Train Model
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# `build_model` and `train_model` are not among the names this notebook imports
# from best_library (NameError). Use the imported ModelBuilder and Trainer
# classes, called the same way as in the end-to-end cell of this notebook.
model = ModelBuilder(device, num_classes=len(class_names)).build()

trainer = Trainer(device)
trainer.train(model, train_loader, val_loader, epochs=EPOCHS, lr=LR)

# The original call also passed a save_path. Trainer.train is not shown taking
# one anywhere in this notebook, so the trained weights are written explicitly
# to the same location.
# NOTE(review): assumes `model` is a torch.nn.Module and that a state_dict is
# the expected checkpoint format -- confirm against Predictor.load_model.
torch.save(model.state_dict(), "../alpaca_classifier_notebook.pt")

## 2. Hyperparameter Tuning

Now we use the `HyperparameterTuner` to find the best configuration.

In [None]:
# Candidate values per hyperparameter that are handed to the tuner.
param_grid = {
    'lr': [1e-3, 1e-4],
    'batch_size': [8, 16],
    'epochs': [3] # Keeping it short for demo
}

print("Initializing Tuner...")
# NOTE(review): elsewhere in this notebook the tuner is built as
# HyperparameterTuner(param_grid, DEVICE) and tune() receives the loaders plus
# a save_path -- confirm which signature the installed best_library exposes.
tuner = HyperparameterTuner(WORK_DIR, param_grid, img_size=IMG_SIZE)

# best_acc is returned alongside best_params but is not reported by this cell.
best_params, best_acc = tuner.tune()

print(f"Optimization finished! Best Params: {best_params}")