diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index fc9d2d3..a03b688 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -62,7 +62,11 @@ jobs:
         if: runner.os == 'Linux'
         run: |
           sudo apt-get update
-          sudo apt-get install -y qpdf ghostscript
+          sudo apt-get install -y qpdf ghostscript graphviz
+      - if: runner.os == 'macOS'
+        run: brew install graphviz
+      - if: runner.os == 'Windows'
+        run: choco install graphviz -y
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
diff --git a/.gitignore b/.gitignore
index ae3d780..1a93ee2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ docs
 .httr-oauth
 .DS_Store
 .quarto
+vignettes/*_cache
\ No newline at end of file
diff --git a/R/generic_functional_fit.R b/R/generic_functional_fit.R
index bab2f70..a1e43b7 100644
--- a/R/generic_functional_fit.R
+++ b/R/generic_functional_fit.R
@@ -128,7 +128,15 @@ generic_functional_fit <- function(
 
   # --- Get Repetition Count ---
   num_repeats_arg <- paste0("num_", block_name)
-  num_repeats <- all_args[[num_repeats_arg]] %||% 1
+  num_repeats_val <- all_args[[num_repeats_arg]]
+
+  # If num_repeats_val is NULL or zapped, default to 1.
+  # Otherwise, use the value provided by the user.
+  if (is.null(num_repeats_val) || inherits(num_repeats_val, "rlang_zap")) {
+    num_repeats <- 1
+  } else {
+    num_repeats <- as.integer(num_repeats_val)
+  }
 
   # --- Get Hyperparameters for this block ---
   # Hyperparameters are formals that are NOT other block names (graph connections)
diff --git a/R/generic_sequential_fit.R b/R/generic_sequential_fit.R
index 6c0009c..f776338 100644
--- a/R/generic_sequential_fit.R
+++ b/R/generic_sequential_fit.R
@@ -118,7 +118,14 @@ generic_sequential_fit <- function(
 
   num_repeats_arg <- paste0("num_", block_name)
   num_repeats_val <- all_args[[num_repeats_arg]]
-  num_repeats <- num_repeats_val %||% 1
+
+  # If num_repeats_val is NULL or zapped, default to 1.
+  # Otherwise, use the value provided by the user.
+  if (is.null(num_repeats_val) || inherits(num_repeats_val, "rlang_zap")) {
+    num_repeats <- 1
+  } else {
+    num_repeats <- as.integer(num_repeats_val)
+  }
 
   # Get the arguments for this specific block from `...`
   block_arg_names <- names(block_fmls)[-1] # Exclude 'model'
diff --git a/R/register_fit_predict.R b/R/register_fit_predict.R
index 6b3b748..5b8eba9 100644
--- a/R/register_fit_predict.R
+++ b/R/register_fit_predict.R
@@ -57,7 +57,7 @@ register_fit_predict <- function(model_name, mode, layer_blocks, functional) {
       func = c(fun = "predict"),
       args = list(
         object = rlang::expr(object$fit$fit),
-        x = rlang::expr(as.matrix(new_data))
+        x = rlang::expr(process_x(new_data)$x_proc)
       )
     )
   )
@@ -74,7 +74,7 @@ register_fit_predict <- function(model_name, mode, layer_blocks, functional) {
       func = c(fun = "predict"),
       args = list(
         object = rlang::expr(object$fit$fit),
-        x = rlang::expr(as.matrix(new_data))
+        x = rlang::expr(process_x(new_data)$x_proc)
       )
     )
   )
@@ -89,7 +89,7 @@ register_fit_predict <- function(model_name, mode, layer_blocks, functional) {
       func = c(fun = "predict"),
       args = list(
         object = rlang::expr(object$fit$fit),
-        x = rlang::expr(as.matrix(new_data))
+        x = rlang::expr(process_x(new_data)$x_proc)
       )
     )
   )
diff --git a/R/utils.R b/R/utils.R
index 5cce55c..cb8a11b 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -220,6 +220,11 @@ process_x <- function(x) {
 #' @importFrom keras3 to_categorical
 #' @noRd
 process_y <- function(y, is_classification = NULL, class_levels = NULL) {
+  # If y is a data frame/tibble, extract the first column
+  if (is.data.frame(y)) {
+    y <- y[[1]]
+  }
+
   if (is.null(is_classification)) {
     is_classification <- is.factor(y)
   }
diff --git a/_pkgdown.yml b/_pkgdown.yml
index ebce87e..c585ff1 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -17,8 +17,8 @@ guides:
 
 - title: "Getting Started"
   navbar: ~
   contents:
-  - getting-started
-  - functional-api
+  - getting_started
+  - functional_api
 
 # examples:
@@ -63,7 +63,7 @@ navbar:
 components:
   intro:
     text: "Getting started"
-    href: guides/getting-started.html
+    href: guides/getting_started.html
   github:
     icon: fa-github
     href: https://github.com/davidrsch/kerasnip
diff --git a/tests/testthat/helper-keras.R b/tests/testthat/helper_keras.R
similarity index 97%
rename from tests/testthat/helper-keras.R
rename to tests/testthat/helper_keras.R
index ce44dee..fc1aded 100644
--- a/tests/testthat/helper-keras.R
+++ b/tests/testthat/helper_keras.R
@@ -6,6 +6,7 @@ library(modeldata)
 library(rsample)
 library(dials)
 library(tune)
+library(purrr)
 
 skip_if_no_keras <- function() {
   testthat::skip_if_not_installed("keras3")
diff --git a/tests/testthat/test-e2e-classification.R b/tests/testthat/test_e2e_classification.R
similarity index 100%
rename from tests/testthat/test-e2e-classification.R
rename to tests/testthat/test_e2e_classification.R
diff --git a/tests/testthat/test-e2e-features.R b/tests/testthat/test_e2e_features.R
similarity index 92%
rename from tests/testthat/test-e2e-features.R
rename to tests/testthat/test_e2e_features.R
index 67e776e..e9d4f85 100644
--- a/tests/testthat/test-e2e-features.R
+++ b/tests/testthat/test_e2e_features.R
@@ -102,14 +102,19 @@ test_that("E2E: Customizing fit arguments works", {
   expect_lt(length(fit_obj$fit$history$metrics$loss), 5)
 })
 
-test_that("E2E: Setting num_blocks = 0 works", {
+test_that("E2E: Setting num_blocks = 0 works for sequential models", {
   skip_if_no_keras()
 
   input_block_zero <- function(model, input_shape) {
     keras3::keras_model_sequential(input_shape = input_shape)
   }
   dense_block_zero <- function(model, units = 16) {
-    model |> keras3::layer_dense(units = units, activation = "relu")
+    model |>
+      keras3::layer_dense(
+        units = units,
+        activation = "relu",
+        name = "i_should_not_exist"
+      )
   }
   output_block_zero <- function(model) {
     model |> keras3::layer_dense(units = 1)
@@ -128,10 +133,18 @@
     mode = "regression"
   )
 
-  spec <- e2e_mlp_zero(num_dense = 0, fit_epochs = 2) |>
+  spec <- e2e_mlp_zero(num_dense = 0, fit_epochs = 1) |>
     parsnip::set_engine("keras")
-  # This should fit a model with only an input and output layer
-  expect_no_error(parsnip::fit(spec, mpg ~ ., data = mtcars))
+
+  fit_obj <- parsnip::fit(spec, mpg ~ ., data = mtcars)
+
+  # Check that the dense layer is NOT in the model
+  keras_model <- fit_obj |> extract_keras_summary()
+  expect_equal(length(keras_model$layers), 1) # Output layer only
+
+  # Check layer names explicitly
+  layer_names <- sapply(keras_model$layers, function(l) l$name)
+  expect_false("i_should_not_exist" %in% layer_names)
 })
 
 test_that("E2E: Error handling for reserved names works", {
diff --git a/tests/testthat/test-e2e-functional.R b/tests/testthat/test_e2e_functional.R
similarity index 77%
rename from tests/testthat/test-e2e-functional.R
rename to tests/testthat/test_e2e_functional.R
index 5ef3fea..704cb92 100644
--- a/tests/testthat/test-e2e-functional.R
+++ b/tests/testthat/test_e2e_functional.R
@@ -182,3 +182,57 @@ test_that("E2E: Functional spec tuning (including repetition) works", {
   expect_s3_class(metrics, "tbl_df")
   expect_true(all(c("num_dense_path", "dense_path_units") %in% names(metrics)))
 })
+
+test_that("E2E: Block repetition works for functional models", {
+  skip_if_no_keras()
+
+  input_block <- function(input_shape) keras3::layer_input(shape = input_shape)
+  dense_block <- function(tensor, units = 8) {
+    tensor |> keras3::layer_dense(units = units, activation = "relu")
+  }
+  output_block <- function(tensor) keras3::layer_dense(tensor, units = 1)
+
+  model_name <- "e2e_func_repeat"
+  on.exit(suppressMessages(remove_keras_spec(model_name)), add = TRUE)
+
+  create_keras_functional_spec(
+    model_name = model_name,
+    layer_blocks = list(
+      main_input = input_block,
+      dense_path = inp_spec(dense_block, "main_input"),
+      output = inp_spec(output_block, "dense_path")
+    ),
+    mode = "regression"
+  )
+
+  # --- Test with 1 repetition ---
+  spec_1 <- e2e_func_repeat(num_dense_path = 1, fit_epochs = 1) |>
+    set_engine("keras")
+  fit_1 <- fit(spec_1, mpg ~ ., data = mtcars)
+  model_1_layers <- fit_1 |>
+    extract_keras_summary() |>
+    pluck("layers")
+
+  # Expect 3 layers: Input, Dense, Output
+  expect_equal(length(model_1_layers), 3)
+
+  # --- Test with 2 repetitions ---
+  spec_2 <- e2e_func_repeat(num_dense_path = 2, fit_epochs = 1) |>
+    set_engine("keras")
+  fit_2 <- fit(spec_2, mpg ~ ., data = mtcars)
+  model_2_layers <- fit_2 |>
+    extract_keras_summary() |>
+    pluck("layers")
+
+  # Expect 4 layers: Input, Dense, Dense, Output
+  expect_equal(length(model_2_layers), 4)
+
+  # --- Test with 0 repetitions ---
+  spec_3 <- e2e_func_repeat(num_dense_path = 0, fit_epochs = 1) |>
+    set_engine("keras")
+  fit_3 <- fit(spec_3, mpg ~ ., data = mtcars)
+  model_3_layers <- fit_3 |>
+    extract_keras_summary() |>
+    pluck("layers")
+
+  # Expect 2 layers: Input, Output
+  expect_equal(length(model_3_layers), 2)
+})
diff --git a/tests/testthat/test-e2e-multiblock-tuning.R b/tests/testthat/test_e2e_multiblock_tuning.R
similarity index 100%
rename from tests/testthat/test-e2e-multiblock-tuning.R
rename to tests/testthat/test_e2e_multiblock_tuning.R
diff --git a/tests/testthat/test-e2e-regression.R b/tests/testthat/test_e2e_regression.R
similarity index 100%
rename from tests/testthat/test-e2e-regression.R
rename to tests/testthat/test_e2e_regression.R
diff --git a/tests/testthat/test-e2e-spec-removal.R b/tests/testthat/test_e2e_spec_removal.R
similarity index 100%
rename from tests/testthat/test-e2e-spec-removal.R
rename to tests/testthat/test_e2e_spec_removal.R
diff --git a/tests/testthat/test-e2e-tuning.R b/tests/testthat/test_e2e_tuning.R
similarity index 100%
rename from tests/testthat/test-e2e-tuning.R
rename to tests/testthat/test_e2e_tuning.R
diff --git a/vignettes/functional-api.Rmd b/vignettes/functional_api.Rmd
similarity index 100%
rename from vignettes/functional-api.Rmd
rename to vignettes/functional_api.Rmd
diff --git a/vignettes/getting-started.Rmd b/vignettes/getting-started.Rmd
deleted file mode 100644
index 7ecc58d..0000000
--- a/vignettes/getting-started.Rmd
+++ /dev/null
@@ -1,243 +0,0 @@
----
-title: "Getting Started with kerasnip"
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{Getting Started with kerasnip}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-## The Core Idea: From Keras Layers to Tidymodels Specs
-
-The `keras3` package allows for building deep learning models layer-by-layer, which is a powerful and flexible approach. However, the `tidymodels` ecosystem is designed around declarative model specifications, where you define what model you want and which of its parameters you want to tune, rather than building it imperatively.
-
-`kerasnip` bridges this gap with a simple but powerful concept: layer blocks. You define the components of your neural network (e.g., an input block, a dense block, a dropout block) as simple R functions. `kerasnip` then uses these blocks as building materials to create a brand new parsnip model specification function for you.
-
-This new function behaves just like any other parsnip model (e.g., `rand_forest()` or `linear_reg()`), making it easy to integrate into workflows and tune with tune.
-
-We’ll start by loading `kerasnip`, `tidymodels` and `keras3`:
-
-```{r load-kerasnip}
-library(kerasnip)
-library(tidymodels)
-library(keras3)
-```
-
-## Example 1: Building and Fitting a Basic MLP
-
-Let's start by building a simple Multi-Layer Perceptron (MLP) for a regression task using the `mtcars` dataset.
-
-### Step 1: Define the Layer Blocks
-
-We need three blocks:
-
-- 1\. An input block to initialize the model and define the input shape. `kerasnip` will automatically pass the input_shape argument during fitting.
-
-- 2\. A dense block for our hidden layers. We'll give it a units argument so we can control the number of neurons.
-
-- 3\. An output block for the final prediction. For regression, this is typically a single neuron with a linear activation.
-
-    ```{r define-blocks}
-    # 1. The input block must initialize the model.
-    # input_shape is passed automatically by the fit engine.
-    mlp_input_block <- function(model, input_shape) {
-      keras_model_sequential(input_shape = input_shape)
-    }
-
-    # 2. A block for hidden layers. units will become a tunable parameter.
-    mlp_dense_block <- function(model, units = 32) {
-      model |>
-        layer_dense(units = units, activation = "relu")
-    }
-
-    # 3. The output block for a regression model.
-    mlp_output_block <- function(model) {
-      model |>
-        layer_dense(units = 1)
-    }
-    ```
-
-### Step 2: Create the Model Specification
-
-Now, we use `create_keras_sequential_spec()` to generate a new model function, which we'll call `basic_mlp()`. We provide our layer blocks in the order they should be assembled.
-
-```{r create-spec}
-create_keras_sequential_spec(
-  model_name = "basic_mlp",
-  layer_blocks = list(
-    input = mlp_input_block,
-    dense = mlp_dense_block,
-    output = mlp_output_block
-  ),
-  mode = "regression"
-)
-```
-
-This function call has a side-effect: a new function `basic_mlp()` is now available in our environment! Notice its arguments: `kerasnip`automatically created `num_dense` (to control the number of dense layers) and `dense_units` (from the units argument in our `mlp_dense_block`).
-
-### Step 3: Use the Spec in a Workflow
-
-We can now use `basic_mlp()` like any other parsnip model. Let's define a model with two hidden layers, each with 64 units, and train it for 50 epochs.
-
-```{r use-spec}
-spec <- basic_mlp(
-  num_dense = 2,
-  dense_units = 64,
-  fit_epochs = 50,
-  learn_rate = 0.01
-) |>
-  set_engine("keras")
-
-print(spec)
-```
-
-We'll use a simple recipe to normalize the predictors and combine it with our model spec in a workflow.
-
-```{r fit-model}
-# Suppress verbose Keras output for the vignette
-options(keras.fit_verbose = 0)
-
-rec <- recipe(mpg ~ ., data = mtcars) |>
-  step_normalize(all_numeric_predictors())
-
-wf <- workflow() |>
-  add_recipe(rec) |>
-  add_model(spec)
-
-set.seed(123)
-fit_obj <- fit(wf, data = mtcars)
-```
-
-### Step 4: Make Predictions
-
-Predictions work just as you'd expect in `tidymodels`.
-
-```{r predict}
-predictions <- predict(fit_obj, new_data = mtcars[1:5, ])
-print(predictions)
-```
-
-## Example 2: Tuning the Model Architecture
-
-The real power of `kerasnip` comes from its ability to tune not just *hyperparameters* (like learning rate or dropout), but the *architecture* of the network itself.
-
-Let's create a more complex *tunable* specification where we let `tune` find the optimal number of dense layers, the number of units in those layers, and the rate for a final dropout layer.
-
-### Step 1: Define Blocks and Create a New Spec
-
-First, we'll define an additional block for dropout and then create a new model specification, `tunable_mlp`, that includes it.
-
-```{r define-tunable-blocks}
-tunable_dropout_block <- function(model, rate = 0.2) {
-  model |>
-    layer_dropout(rate = rate)
-}
-
-create_keras_sequential_spec(
-  model_name = "tunable_mlp",
-  layer_blocks = list(
-    input = mlp_input_block,
-    dense = mlp_dense_block,
-    dropout = tunable_dropout_block,
-    output = mlp_output_block
-  ),
-  mode = "regression"
-)
-```
-
-### Step 2: Define a Tunable Specification
-
-We use our new `tunable_mlp()` function, passing `tune()` to the arguments we want to optimize. We will have one dropout layer before the output.
-
-```{r tune-spec}
-tune_spec <- tunable_mlp(
-  num_dense = tune(),
-  dense_units = tune(),
-  num_dropout = 1,
-  dropout_rate = tune(),
-  fit_epochs = 20
-) |>
-  set_engine("keras")
-
-print(tune_spec)
-```
-
-### Step 3: Set up the Tuning Grid
-
-We create a `workflow` as before. Then, we can use helper functions from `dials` to define the search space for our parameters.
-
-```{r setup-tuning}
-tune_wf <- workflow() |>
-  add_recipe(rec) |>
-  add_model(tune_spec)
-
-# Define the tuning grid.
-# `num_terms()` is the dials function for `num_*` parameters.
-# `hidden_units()` is the dials function for `*_units` parameters.
-params <- extract_parameter_set_dials(tune_wf) |>
-  update(
-    num_dense = dials::num_terms(c(1, 3)),
-    dense_units = dials::hidden_units(c(8, 64)),
-    dropout_rate = dials::dropout(c(0.1, 0.5))
-  )
-grid <- grid_regular(params, levels = 2)
-print(grid)
-```
-
-### Step 4: Run the Tuning
-
-We use `tune_grid()` with resamples to evaluate each combination of architectural parameters.
-
-```{r run-tuning, cache=TRUE}
-set.seed(456)
-folds <- vfold_cv(mtcars, v = 3)
-
-# The control argument is used to prevent saving predictions, which
-# can be large for Keras models.
-tune_res <- tune_grid(
-  tune_wf,
-  resamples = folds,
-  grid = grid,
-  control = control_grid(save_pred = FALSE)
-)
-```
-
-### Step 5: Analyze the Results
-
-We can now see which architecture performed best.
-
-```{r show-best}
-show_best(tune_res, metric = "rmse")
-```
-
-The results show that `tune` has successfully tested different network depths (`num_dense`), widths (`dense_units`), and dropout rates to find the best-performing combination. This demonstrates how `kerasnip` seamlessly integrates complex architectural tuning into the standard `tidymodels` workflow.
-
-## Advanced Customization
-
-`kerasnip` provides a clean API for passing arguments directly to Keras's `compile()` and `fit()` methods.
-
-- **Compile Arguments**: Pass any argument to `keras3::compile()` by prefixing it with `compile_`. For example, to change the loss function you would use `compile_loss = "mae"`.
-- **Fit Arguments**: Pass any argument to `keras3::fit()` by prefixing it with `fit_`. For example, to set a validation split and add a callback, you would use `fit_validation_split = 0.2` and `fit_callbacks = list(...)`.
-
-Here is an example of using these arguments to specify a different loss function, a validation split, and an early stopping callback.
-
-```{r advanced-customization}
-
-adv_spec <- basic_mlp(
-  num_dense = 2,
-  dense_units = 32,
-  fit_epochs = 100,
-  # Arguments for keras3::compile()
-  compile_loss = "mae",
-  # Arguments for keras3::fit()
-  fit_validation_split = 0.2,
-  fit_callbacks = list(
-    keras3::callback_early_stopping(patience = 5)
-  )
-) |>
-  set_engine("keras")
-
-print(adv_spec)
-```
-
-This system gives you full control over the Keras training process while keeping the model specification function signature clean and focused on the *tunable* parameters.
\ No newline at end of file
diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd
new file mode 100644
index 0000000..556dec5
--- /dev/null
+++ b/vignettes/getting_started.Rmd
@@ -0,0 +1,328 @@
+---
+title: "Getting Started with kerasnip"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Getting Started with kerasnip}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+# Suppress verbose Keras output for the vignette
+options(keras.fit_verbose = 0)
+set.seed(123)
+```
+
+## The Core Idea: From Keras Layers to Tidymodels Specs
+
+The `keras3` package allows for building deep learning models layer-by-layer, which is a powerful and flexible approach. However, the `tidymodels` ecosystem is designed around declarative model specifications, where you define *what* model you want and which of its parameters you want to tune, rather than building it imperatively.
+
+`kerasnip` bridges this gap with a simple but powerful concept: **layer blocks**. You define the components of your neural network (e.g., an input block, a dense block, a dropout block) as simple R functions. `kerasnip` then uses these blocks as building materials to create a brand new `parsnip` model specification function for you.
+
+This new function behaves just like any other `parsnip` model (e.g., `rand_forest()` or `linear_reg()`), making it easy to integrate into `tidymodels` workflows.
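+
+Concretely, a layer block is just an ordinary R function: its first argument is the model being built, and every remaining argument with a default becomes a tunable parameter on the generated specification. A minimal sketch (the block and its `units` argument here are purely illustrative; the real blocks for this vignette are defined below):
+
+```r
+# A hypothetical dense block: because `units` has a default,
+# kerasnip will expose it as an argument of the generated spec.
+my_dense_block <- function(model, units = 32) {
+  model |> keras3::layer_dense(units = units, activation = "relu")
+}
+```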
+
+## Installation
+
+You can install the development version of `kerasnip` from GitHub. You will also need `keras3` and a backend (like TensorFlow).
+
+```{r, eval = FALSE}
+# install.packages("pak")
+pak::pak("davidrsch/kerasnip")
+pak::pak("rstudio/keras3")
+
+# Install the backend
+keras3::install_keras()
+```
+
+We’ll start by loading `kerasnip`, `tidymodels` and `keras3`:
+
+```{r load-kerasnip}
+library(kerasnip)
+library(tidymodels)
+library(keras3)
+```
+
+## A `kerasnip` MNIST Example
+
+Let’s replicate the classic Keras introductory example, training a simple MLP on the MNIST dataset, but using the `kerasnip` workflow. This will demonstrate how to translate a standard Keras model into a reusable, modular `parsnip` specification.
+
+If you’re familiar with Keras, you’ll recognize the structure; if not, this is a perfect place to start. We’ll begin by learning the basics through a simple task: recognizing handwritten digits from the MNIST dataset.
+
+The MNIST dataset contains 28×28 pixel grayscale images of handwritten digits, like these:
+
+![MNIST](images/MNIST.png){fig-alt="A picture showing grayscale images of handwritten digits (5, 0, 4 and 1)"}
+
+Each image comes with a label indicating which digit it represents. For example, the labels for the images above might be 5, 0, 4, and 1.
+
+### Preparing the Data
+
+This step is identical to any other Keras model. We load the MNIST dataset, reshape the predictors, and convert the outcome to a factor for `tidymodels`.
+
+```{r prepare-data}
+mnist <- dataset_mnist()
+x_train <- mnist$train$x
+y_train <- mnist$train$y
+x_test <- mnist$test$x
+y_test <- mnist$test$y
+
+# Reshape
+x_train <- array_reshape(x_train, c(nrow(x_train), 784))
+x_test <- array_reshape(x_test, c(nrow(x_test), 784))
+# Rescale
+x_train <- x_train / 255
+x_test <- x_test / 255
+
+# Convert outcomes to factors for tidymodels
+# kerasnip will handle y conversion internally using keras3::to_categorical()
+y_train_factor <- factor(y_train)
+y_test_factor <- factor(y_test)
+
+# For tidymodels, it's best to work with data frames
+# Use I() to keep the matrix structure of x within the data frame
+train_df <- data.frame(x = I(x_train), y = y_train_factor)
+test_df <- data.frame(x = I(x_test), y = y_test_factor)
+```
+
+### The Standard Keras Approach (for comparison)
+
+Before diving into the `kerasnip` workflow, let's quickly look at how this same model is built using standard `keras3` code. This will help highlight the different approach `kerasnip` enables.
+
+```{r keras-standard, eval=FALSE, echo=TRUE, results='hide'}
+# The standard Keras3 approach
+model <- keras_model_sequential(input_shape = 784) |>
+  layer_dense(units = 256, activation = "relu") |>
+  layer_dropout(rate = 0.4) |>
+  layer_dense(units = 128, activation = "relu") |>
+  layer_dropout(rate = 0.3) |>
+  layer_dense(units = 10, activation = "softmax")
+
+summary(model)
+
+model |>
+  compile(
+    loss = "categorical_crossentropy",
+    optimizer = optimizer_rmsprop(),
+    metrics = "accuracy"
+  )
+
+# The model would then be trained with model |> fit(...)
+```
+
+The code above is imperative: you define each layer and add it to the model step-by-step. Now, let's see how `kerasnip` approaches this by defining reusable components for a declarative, `tidymodels`-friendly workflow.
+
+### Defining the Model with Reusable Blocks
+
+The original Keras example interleaves `layer_dense()` and `layer_dropout()`. With `kerasnip`, we can encapsulate this pattern into a single, reusable block. This makes the overall architecture cleaner and more modular.
+
+```{r define-blocks}
+# An input block to initialize the model.
+# The 'model' argument is supplied implicitly by the kerasnip backend.
+mlp_input_block <- function(model, input_shape) {
+  keras_model_sequential(input_shape = input_shape)
+}
+
+# A reusable "module" that combines a dense layer and a dropout layer.
+# All arguments that should be tunable need a default value.
+dense_dropout_block <- function(model, units = 128, rate = 0.1) {
+  model |>
+    layer_dense(units = units, activation = "relu") |>
+    layer_dropout(rate = rate)
+}
+
+# The output block for classification.
+mlp_output_block <- function(model, num_classes) {
+  model |> layer_dense(units = num_classes, activation = "softmax")
+}
+```
+
+Now, we use `create_keras_sequential_spec()` to generate our `parsnip` model function.
+
+```{r create-spec}
+create_keras_sequential_spec(
+  model_name = "mnist_mlp",
+  layer_blocks = list(
+    input = mlp_input_block,
+    hidden_1 = dense_dropout_block,
+    hidden_2 = dense_dropout_block,
+    output = mlp_output_block
+  ),
+  mode = "classification"
+)
+```
+
+### Building and Fitting the Model
+
+We can now use our new `mnist_mlp()` function. Notice how its arguments, such as `hidden_1_units` and `hidden_1_rate`, were automatically generated by `kerasnip`. The names are created by combining the name of the layer block (e.g., `hidden_1`) with the arguments of that block's function (e.g., `units`, `rate`).
+
+To replicate the `keras3` example, we'll use both `hidden` blocks and provide their parameters.
+
+```{r use-spec}
+mlp_spec <- mnist_mlp(
+  hidden_1_units = 256,
+  hidden_1_rate = 0.4,
+  hidden_2_units = 128,
+  hidden_2_rate = 0.3,
+  compile_loss = "categorical_crossentropy",
+  compile_optimizer = optimizer_rmsprop(),
+  compile_metrics = c("accuracy"),
+  fit_epochs = 30,
+  fit_batch_size = 128,
+  fit_validation_split = 0.2
+) |>
+  set_engine("keras")
+
+# Fit the model
+mlp_fit <- fit(mlp_spec, y ~ x, data = train_df)
+```
+
+```{r model-summarize}
+mlp_fit |>
+  extract_keras_summary()
+```
+
+```{r model-plot}
+mlp_fit |>
+  extract_keras_summary() |>
+  plot(show_shapes = TRUE)
+```
+
+```{r model-fit-history}
+mlp_fit |>
+  extract_keras_history() |>
+  plot()
+```
+
+### Evaluating Model Performance
+
+The `keras_evaluate()` function provides a straightforward way to assess the model's performance on a test set, using the underlying `keras3::evaluate()` method. It returns the loss and any other metrics that were specified during the model compilation step.
+
+```{r model-evaluate}
+mlp_fit |> keras_evaluate(x_test, y_test)
+```
+
+### Making Predictions
+
+Once the model is trained, we can use the standard `tidymodels` `predict()` function to generate predictions on new data. By default, `predict()` on a `parsnip` classification model returns the predicted class labels.
+
+```{r model-predict-class}
+# Predict the class for the first few images in the test set
+class_preds <- mlp_fit |>
+  predict(new_data = head(test_df))
+class_preds
+```
+
+To get the underlying probabilities for each class, we can set `type = "prob"`. This returns a tibble with a probability column for each of the 10 classes (0-9).
+
+```{r model-predict-prob}
+# Predict probabilities for the same images
+prob_preds <- mlp_fit |> predict(new_data = head(test_df), type = "prob")
+prob_preds
+```
+
+We can then compare the predicted class to the actual class for these images to see how the model is performing.
+
+```{r model-predict-compare}
+# Combine predictions with actuals for comparison
+comparison <- bind_cols(
+  class_preds,
+  prob_preds
+) |>
+  bind_cols(
+    head(test_df[, "y", drop = FALSE])
+  )
+comparison
+```
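+
+Since `comparison` now holds both the predicted and the true classes, we can also sketch a quick accuracy check on these few rows (`accuracy()` comes from `yardstick`, which is attached with `tidymodels`; a real evaluation would use the full test set, as `keras_evaluate()` did above):
+
+```{r model-predict-accuracy}
+# Overall accuracy on the handful of rows predicted above
+comparison |>
+  accuracy(truth = y, estimate = .pred_class)
+```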
+
+## Example 2: Tuning the Model Architecture
+
+Now we’ll showcase the main strength of `kerasnip`: tuning the network architecture itself. We can treat the number of layers, and the parameters of those layers, as hyperparameters to be optimized by `tune`.
+
+Using the `mnist_mlp` spec we just created, let's define a tunable model.
+
+```{r tune-spec-mnist}
+# Define a tunable specification
+# We set num_hidden_2 = 0 to disable the second hidden block for this tuning example
+tune_spec <- mnist_mlp(
+  num_hidden_1 = tune(),
+  hidden_1_units = tune(),
+  hidden_1_rate = tune(),
+  num_hidden_2 = 0,
+  compile_loss = "categorical_crossentropy",
+  compile_optimizer = optimizer_rmsprop(),
+  compile_metrics = c("accuracy"),
+  fit_epochs = 30,
+  fit_batch_size = 128,
+  fit_validation_split = 0.2
+) |>
+  set_engine("keras")
+
+# Create a workflow
+tune_wf <- workflow(y ~ x, tune_spec)
+```
+
+Next, we define the search space for our tunable parameters using `dials`.
+
+```{r create-grid-mnist}
+# Define the tuning grid
+params <- extract_parameter_set_dials(tune_wf) |>
+  update(
+    num_hidden_1 = dials::num_terms(c(1, 3)),
+    hidden_1_units = dials::hidden_units(c(64, 256)),
+    hidden_1_rate = dials::dropout(c(0.2, 0.4))
+  )
+grid <- grid_regular(params, levels = 3)
+grid
+```
+
+```{r run-tuning-mnist, cache=TRUE}
+folds <- vfold_cv(train_df, v = 3)
+
+tune_res <- tune_grid(
+  tune_wf,
+  resamples = folds,
+  grid = grid,
+  metrics = metric_set(accuracy),
+  control = control_grid(save_pred = FALSE, save_workflow = TRUE)
+)
+```
+
+Finally, we can inspect the results to find which architecture performed best. First, a summary table:
+
+```{r show-best-mnist}
+# Show the summary table of the best models
+show_best(tune_res, metric = "accuracy")
+```
+
+Now that we've identified the best-performing hyperparameters, our final step is to create and train the final model. We use `select_best()` to get the top parameters, `finalize_workflow()` to update our workflow with them, and then `fit()` one last time on our full training dataset.
+
+```{r finalize-best-model}
+# Select the best hyperparameters
+best_hps <- select_best(tune_res, metric = "accuracy")
+
+# Finalize the workflow with the best hyperparameters
+final_wf <- finalize_workflow(tune_wf, best_hps)
+
+# Fit the final model on the full training data
+final_fit <- fit(final_wf, data = train_df)
+```
+
+We can now inspect our final, tuned model.
+
+```{r inspect-final-model}
+# Print the model summary
+final_fit |>
+  extract_fit_parsnip() |>
+  extract_keras_summary()
+
+# Plot the training history
+final_fit |>
+  extract_fit_parsnip() |>
+  extract_keras_history() |>
+  plot()
+```
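+
+As a last sanity check, we can sketch an evaluation of the finalized model on the held-out test set (this reuses `test_df` from above and scores the predictions with `yardstick`'s `accuracy()`):
+
+```{r final-model-test-accuracy}
+# Predict on the held-out test set and score the predictions
+final_test_preds <- predict(final_fit, new_data = test_df) |>
+  bind_cols(test_df["y"])
+
+final_test_preds |>
+  accuracy(truth = y, estimate = .pred_class)
+```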
+
+This result shows that `tune` has tested various network depths, widths, and dropout rates, successfully finding the best-performing combination within the search space. By using `kerasnip`, we were able to integrate this complex architectural tuning directly into a standard `tidymodels` workflow.
\ No newline at end of file
diff --git a/vignettes/images/MNIST.png b/vignettes/images/MNIST.png
new file mode 100644
index 0000000..0a558b7
Binary files /dev/null and b/vignettes/images/MNIST.png differ