In [None]:
import sys
sys.path.append("..") # for sibling import

import pandas as pd
import walnut

# Example 3.1

### Deep neural network using sequential model

The goal of this model is to classify iris species based on numerical features.

### Step 1: Prepare data
You will need to download the dataset from https://www.kaggle.com/datasets/uciml/iris and place it into the *data* directory.

In [None]:
# Load the raw Iris data; `data_orig` stays untouched for later reference.
data_orig = pd.read_csv('../data/iris.csv')

# Derive the working frame without the 'Id' column. `drop` returns a new
# frame, so `data_orig` is not modified.
data = data_orig.drop(columns=['Id'])

data.head()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts.
data.info()

The labels are categorical values. To be used in the model, all data needs to be numerical. The function `pd_categorical_to_numeric()` can be used to numerically encode the categorical columns of a Pandas DataFrame object (here, each species is mapped to an integer class label).

In [None]:
# Encode the categorical "Species" column numerically so the whole frame can
# be converted into a tensor afterwards.
data_enc = walnut.preprocessing.encoding.pd_categorical_to_numeric(data, columns=["Species"])

# Fixed random_state so the preview shows the same ten rows on every run.
# NOTE(review): assumes `data_enc` is a pandas DataFrame — confirm.
data_enc.sample(10, random_state=0)

Next, the data is split into training, validation and test datasets using `split_train_val_test()`, so that the model can be evaluated later on. The data is also shuffled before splitting, since raw data is sometimes sorted in some way.

In [None]:
# Convert the encoded DataFrame into a walnut tensor.
tensor = walnut.df_to_tensor(data_enc)
# Split into training, validation and test subsets; no ratios are passed, so
# the library defaults apply.
t_train, t_val, t_test = walnut.preprocessing.split_train_val_test(tensor)
# Preview the first five rows of the training split.
t_train[:5]

Features and labels are now separated.

In [None]:
# Features: every column except the last one.
x_train, x_val, x_test = (t[:, :-1] for t in (t_train, t_val, t_test))

# Labels: the last column, cast to integers.
y_train, y_val, y_test = (t[:, -1].astype("int") for t in (t_train, t_val, t_test))

In [None]:
# Preview the first ten training labels.
y_train[:10]

Neural networks tend to run into problems if values are very high. Therefore it is common to normalize the data. This can be done using the `normalize()` function, which applies min-max feature scaling to a tensor.<br><br>
$ X'=a+\frac{(X-X_{min})\cdot(b-a)}{X_{max}-X_{min}} $,<br><br>where<br><br>$ a $ ... lower bound<br>$ b $ ... upper bound

In [None]:
# Normalize the features of every split along axis 0.
# NOTE(review): each call only receives its own split, so validation and test
# data are presumably scaled with their own min/max rather than the training
# statistics — confirm this is intended.
x_train, x_val, x_test = (
    walnut.preprocessing.normalize(split, axis=0)
    for split in (x_train, x_val, x_test)
)

x_train[:5]

In [None]:
# Sanity check: print the shapes of all feature/label tensors.

# training split
print(f'{x_train.shape=}')
print(f'{y_train.shape=}')

# validation split
print(f'{x_val.shape=}')
print(f'{y_val.shape=}')

# test split
print(f'{x_test.shape=}')
print(f'{y_test.shape=}')

### Step 2: Build the neural network structure
Here the individual layers of the neural network model are defined. If the weights for a layer are not defined, they are initialized randomly. For experimental purposes, they can be manually initialized using various initialization methods, such as `kaiming_normal`.

In [None]:
import walnut.nn as nn
from walnut.nn.layers import *
from walnut.nn.inits import *

n_hidden = 100
gain = get_gain("tanh")

# Switch between the two initialization schemes compared in the analysis
# section: False -> `normal`, True -> `kaiming_normal` with the tanh gain.
use_kaiming = False

if use_kaiming:
    init = kaiming_normal
    hidden_args = (gain,)  # layers followed by Tanh are initialized with the tanh gain
else:
    init = normal
    hidden_args = ()

# Three Tanh-activated hidden layers followed by a linear output layer with
# 3 outputs. The output layer is always initialized without a gain.
model = nn.Sequential([
    Linear(4, n_hidden, weights=init((4, n_hidden), *hidden_args)), Tanh(),
    Linear(n_hidden, n_hidden, weights=init((n_hidden, n_hidden), *hidden_args)), Tanh(),
    Linear(n_hidden, n_hidden, weights=init((n_hidden, n_hidden), *hidden_args)), Tanh(),
    Linear(n_hidden, 3, weights=init((n_hidden, 3)))
])

The network is compiled to finalize the model. Besides the SGD optimizer, the framework also provides other algorithms like Adam. There are also multiple loss functions to choose from. Since this example explores a classification problem, the cross entropy loss function is used.

In [None]:
# Attach optimizer, loss function and evaluation metric to the model.
model.compile(
    optimizer=nn.optimizers.SGD(),  # constructed without arguments, i.e. default settings
    loss_fn=nn.losses.Crossentropy(),
    metric=nn.metrics.get_accuracy  # passed as a callable, not called here
)

In [None]:
from walnut.nn.analysis import model_summary
# (4,) matches the 4 input features of the first Linear layer; presumably the
# shape of a single input sample — confirm against model_summary's signature.
model_summary(model, (4,))

### Step 3: Train the model

In [None]:
# Train for 100 epochs with the validation split passed along. The two returned
# values are kept as training and validation loss histories and plotted below.
train_loss_hist, val_loss_hist = model.train(x_train, y_train, epochs=100, val_data=(x_val, y_val), verbose=False)

In [None]:
# Both loss histories, keyed by trace name.
traces = {"train_loss": train_loss_hist, "val_loss": val_loss_hist}

nn.analysis.plot_curve(
    traces=traces,
    figsize=(15, 3),
    title="loss history",
    x_label="epoch",
    y_label="loss",
)

### Step 4: Evaluate the model
Using the defined metric, the model's performance can be evaluated using testing/validation data.

In [None]:
# Evaluate on the held-out test split; presumably returns the loss and the
# value of the metric set in compile().
loss, accuracy = model.evaluate(x_test, y_test)
print(f'loss {loss:.4f}')
# accuracy is multiplied by 100 for display
print(f'accuracy {100*accuracy:.2f}')

### Step 5: Analyze the model
Using different plots, the model's performance and training behaviour can be analyzed (e.g. checking for overfitting).

If the `normal` weight initialization method is used, the **tanh** activations saturate very quickly and the gradients "die out". Other initializers, such as `kaiming_normal`, counteract this behaviour. Furthermore, the initial loss is lower, so the model does not waste time correcting unnecessarily high weight values in the beginning. (Analysis inspired by Andrej Karpathy - highly recommend checking out his videos on YouTube.)

In [None]:
# Copy the outputs of every Tanh layer, keyed by "<position> <class name>".
activations = {
    f"{idx + 1} {layer.__class__.__name__}": layer.y.data.copy()
    for idx, layer in enumerate(model.layers[0].layers)
    if layer.__class__.__name__ == "Tanh"
}
nn.analysis.plot_distrbution(
    activations,
    figsize=(15, 3),
    title="activation distribution",
    bins=200,
)

As mentioned, the gradients of saturated neurons get very close to zero. If that happens for all batches, the neuron is not learning and is considered dead (white pixels in the plot). Using the Kaiming He initialization method can reduce this.

In [None]:
# Boolean mask per Tanh layer marking outputs with |y| > 0.99 (saturated).
# Iterates model.layers[0].layers like the other analysis cells; iterating
# model.layers directly would presumably only see the top-level container, so
# the "Tanh" filter would match nothing.
saturations = {
    f"{i + 1} {l.__class__.__name__}": (l.y.abs() > 0.99).data
    for i, l in enumerate(model.layers[0].layers)
    if l.__class__.__name__ == "Tanh"
}
nn.analysis.plot_images(saturations, (150, 30), "gray", plot_axis=True)

In [None]:
# Weight gradients of every Linear layer, keyed by "<position> <class name>".
weight_gradients = {
    f"{idx + 1} {layer.__class__.__name__}": layer.w.grad
    for idx, layer in enumerate(model.layers[0].layers)
    if layer.__class__.__name__ == "Linear"
}
nn.analysis.plot_distrbution(
    weight_gradients,
    figsize=(15, 3),
    title=" weight gradient distribution",
    bins=200,
)

In [None]:
# Gradients w.r.t. the output of every Tanh layer, keyed by "<position> <class name>".
act_gradients = {
    f"{idx + 1} {layer.__class__.__name__}": layer.y.grad
    for idx, layer in enumerate(model.layers[0].layers)
    if layer.__class__.__name__ == "Tanh"
}
nn.analysis.plot_distrbution(
    act_gradients,
    figsize=(15, 3),
    title="activation gradient distribution",
    bins=200,
)