In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Recurrent neural network

- used with sequential data (time series, sentences...)

- parameters: input weights $\mathbf{W}_i$, hidden weights $\mathbf{W}_h$ and output weights $\mathbf{W}_o$

# Forward pass

$$\mathbf{z}_t = \mathbf{h}_{t-1} \cdot \mathbf{W}_h + \mathbf{x} \cdot \mathbf{W}_i$$
$$\mathbf{h}_{t} = \phi(\mathbf{z}_t)$$
$$\mathbf{y}_t = \mathbf{h}_{t} \cdot \mathbf{W}_o$$

# Backward pass

$$\frac{\partial X_i}{\partial X} = W_i$$
$$\frac{\partial X_i}{\partial W_i} = X$$
$$\frac{\partial X_h(t)}{\partial X_i} = \frac{\partial \phi}{\partial Z} \cdot \frac{\partial Z}{\partial X_i} = \frac{\partial \phi}{\partial Z} \cdot 1$$
$$\frac{\partial X_h(t)}{\partial W_h} = \frac{\partial \phi}{\partial Z} \cdot \frac{\partial Z}{\partial W_h} = \frac{\partial \phi}{\partial Z} \cdot X_h(t-1)$$
$$\frac{\partial X_h(t)}{\partial X_h(t-1)} = \frac{\partial \phi}{\partial Z} \cdot \frac{\partial Z}{\partial X_h(t-1)} = \frac{\partial \phi}{\partial Z} \cdot W_h$$
$$\frac{\partial Y}{\partial X_h(t)} = W_0$$
$$\frac{\partial Y}{\partial W_o} = X_h(t)$$

***

$$ 
\begin{align*}
\frac{\partial L}{\partial W_o} &= \frac{\partial L}{\partial Y} \cdot \frac{\partial Y}{\partial W_o} \\
&= \frac{\partial L}{\partial Y} \cdot X_h(t)
\end{align*}
$$

$$ 
\begin{align*}
\frac{\partial L}{\partial W_h} &= \frac{\partial L}{\partial Y} \cdot \frac{\partial Y}{\partial X_h(t)} \cdot \frac{\partial X_h(t)}{\partial W_h} \\
&= \frac{\partial L}{\partial Y} \cdot W_o \cdot \frac{\partial \phi}{\partial Z} \cdot X_h(t-1)
\end{align*}
$$

$$ 
\begin{align*}
\frac{\partial L}{\partial W_i} &= \frac{\partial L}{\partial Y} \cdot \frac{\partial Y}{\partial X_h(t)} \cdot \frac{\partial X_h(t)}{\partial X_i} \cdot \frac{\partial X_i}{\partial W_i} \\
&= \frac{\partial L}{\partial Y} \cdot W_o \cdot \frac{\partial \phi}{\partial Z} \cdot 1 \cdot X
\end{align*}
$$

$$ 
\begin{align*}
\frac{\partial L}{\partial X} &= \frac{\partial L}{\partial Y} \cdot \frac{\partial Y}{\partial X_h(t)} \cdot \frac{\partial X_h(t)}{\partial X_i} \cdot \frac{\partial X_i}{\partial X} \\
&= \frac{\partial L}{\partial Y} \cdot W_o \cdot \frac{\partial \phi}{\partial Z} \cdot 1 \cdot W_i
\end{align*}
$$

In [2]:
class RecurrentLayer:

    def __init__(self, n_inputs: int, n_hidden: int, n_outputs: int):
        k = 1/np.sqrt(n_hidden)
        self.n_hidden = n_hidden
        self.input_weights = np.random.rand(n_inputs, n_hidden) * 2 * k - k
        self.hidden_weights = np.random.rand(n_hidden, n_hidden) * 2 * k - k
        self.output_weights = np.random.rand(n_hidden, n_outputs) * 2 * k - k
        self.output_bias = np.random.rand(n_outputs) * 2 * k - k
        self.hidden_biases = np.random.rand(n_hidden) * 2 * k - k
        
    def forward(self, inputs: np.ndarray):
        self.inputs = inputs
        self.n_samples = inputs.shape[0]
        self.output = np.zeros((self.n_samples, self.output_weights.shape[1]))
        self.hidden_states = np.zeros((self.n_samples, self.n_hidden))

        for idx, x in enumerate(inputs):

            x = x.reshape(1, -1)

            input_x = np.dot(x, self.input_weights)

            hidden_x = input_x + np.dot(self.hidden_states[max(idx-1, 0)], self.hidden_weights) + self.hidden_biases
            hidden_x = np.tanh(hidden_x)
            self.hidden_states[idx] = hidden_x.copy()

            output_x = np.dot(hidden_x, self.output_weights) + self.output_bias
            self.output[idx] = output_x.copy()

    def backward(self, delta: np.ndarray):

        self.dinput_weights = np.zeros_like(self.input_weights)
        self.dhidden_weights = np.zeros_like(self.hidden_weights)
        self.dhidden_biases = np.zeros_like(self.hidden_biases)
        self.doutput_weights = np.zeros_like(self.output_weights)
        self.doutput_bias = np.zeros_like(self.output_bias)
        self.dinputs = np.zeros_like(self.inputs, dtype=np.float64)
        next_hidden = None

        for i in range(self.n_samples - 1, -1, -1):

            loss_gradient = delta[i].reshape(1, -1)
            hidden_state = self.hidden_states[i].reshape(-1, 1)

            self.doutput_weights += np.dot(hidden_state, loss_gradient)
            self.doutput_bias += loss_gradient.reshape(-1)

            hidden_gradient = np.dot(loss_gradient, self.output_weights.T)
            if next_hidden is not None:
                hidden_gradient += np.dot(next_hidden, self.hidden_weights.T)

            dtanh = 1 - self.hidden_states[i]**2
            hidden_gradient *= dtanh

            next_hidden = hidden_gradient.copy()

            if i > 0:
                self.dhidden_weights += np.dot(self.hidden_states[i-1].reshape(-1, 1), hidden_gradient) #self.hidden_states[i - 1].reshape(-1, 1) @ h_grad
                self.dhidden_biases += hidden_gradient.reshape(-1)
                self.dinput_weights += np.dot(self.inputs[i].reshape(-1, 1), hidden_gradient) #self.inputs[i].reshape(-1, 1) @ h_grad

            self.dinputs[i] += np.dot(self.input_weights, hidden_gradient.T).reshape(-1) #(self.input_weights @ h_grad.T).item(0)

In [3]:
def mse_grad(y_pred, y_true):
    return (y_pred - y_true)

def mse_loss(y_pred, y_true):
    return np.mean(0.5 * (y_pred - y_true)**2)

In [4]:
def update_recurrent_layer(layer, lr, momentum=0.9):
    """layer.input_weights += -lr * layer.dinput_weights
    layer.hidden_weights += -lr * layer.dhidden_weights
    layer.hidden_biases += -lr * layer.dhidden_biases
    layer.output_weights += -lr * layer.doutput_weights
    layer.output_bias += -lr * layer.doutput_bias"""

    if momentum:

        if not hasattr(layer, 'input_weights_momentum'):
            layer.input_weights_momentum = np.zeros_like(layer.input_weights)
            layer.hidden_weights_momentum = np.zeros_like(layer.hidden_weights)
            layer.output_weights_momentum = np.zeros_like(layer.output_weights)
            layer.output_bias_momentum = np.zeros_like(layer.output_bias)
            layer.hidden_biases_momentum = np.zeros_like(layer.hidden_biases)

        layer.input_weights_momentum = momentum * layer.input_weights_momentum - lr * layer.dinput_weights
        dinput_weights_updates = layer.input_weights_momentum

        layer.hidden_weights_momentum = momentum * layer.hidden_weights_momentum - lr * layer.dhidden_weights
        dhidden_weights_updates = layer.hidden_weights_momentum

        layer.output_weights_momentum = momentum * layer.output_weights_momentum - lr * layer.doutput_weights
        doutput_weights_updates = layer.output_weights_momentum

        layer.output_bias_momentum = momentum * layer.output_bias_momentum - lr * layer.doutput_bias
        doutput_bias_updates = layer.output_bias_momentum

        layer.hidden_biases_momentum = momentum * layer.hidden_biases_momentum - lr * layer.dhidden_biases
        dhidden_biases_updates = layer.hidden_biases_momentum


    else:

        dinput_weights_updates = - lr * layer.dinput_weights

        dhidden_weights_updates =  -lr * layer.dhidden_weights

        doutput_weights_updates = - lr * layer.doutput_weights

        doutput_bias_updates = - lr * layer.doutput_bias

        dhidden_biases_updates = - lr * layer.dhidden_biases

    layer.input_weights += dinput_weights_updates
    layer.hidden_weights += dhidden_weights_updates
    layer.hidden_biases += dhidden_biases_updates
    layer.output_weights += doutput_weights_updates
    layer.output_bias += doutput_bias_updates

In [5]:
# seed 0, lr=5e-4, backward konvergira
from sklearn.preprocessing import StandardScaler

#np.random.seed(0)

rec11 = RecurrentLayer(1, 5, 1)
rec21 = RecurrentLayer(1, 7, 1)

rec12 = RecurrentLayer(1, 5, 1)
rec22 = RecurrentLayer(1, 7, 1)

sequence = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).reshape(-1, 1)
scaler = StandardScaler()
sequence = scaler.fit_transform(sequence)
result = np.array([60, 52, 52, 53, 52, 50, 52, 56, 54, 57]).reshape(-1, 1)
lr = 5e-3
seq_len = 3

for i in range(500):

    for j in range(len(sequence)-seq_len):

        rec11.forward(sequence[j:j+seq_len, :])
        rec21.forward(rec11.output)

        rec12.forward(sequence[j:j+seq_len, :])
        rec22.forward(rec12.output)

        loss_grad_1 = mse_grad(rec21.output.reshape(-1, 1), result[j:j+seq_len, :])
        loss_grad_2 = mse_grad(rec22.output.reshape(-1, 1), result[j:j+seq_len, :])

        rec21.backward(loss_grad_1)
        rec11.backward(rec21.dinputs)

        rec22.backward(loss_grad_2)
        rec12.backward(rec22.dinputs)

        """if i % 1000 == 0:
            norme11 = []
            norme12 = []
            norme21 = []
            norme22 = []
            for l in [rec11.input_weights, 
                    rec11.hidden_weights, 
                    rec11.output_weights, 
                    rec11.output_bias, 
                    rec11.hidden_biases, 
                    rec11.hidden_states]:
                norme11.append(np.linalg.norm(l))

            for l in [rec12.input_weights, 
                    rec12.hidden_weights, 
                    rec12.output_weights, 
                    rec12.output_bias, 
                    rec12.hidden_biases, 
                    rec12.hidden_states]:
                norme12.append(np.linalg.norm(l))

            for l in [rec21.input_weights, 
                    rec21.hidden_weights, 
                    rec21.output_weights, 
                    rec21.output_bias, 
                    rec21.hidden_biases, 
                    rec21.hidden_states]:
                norme21.append(np.linalg.norm(l))

            for l in [rec22.input_weights, 
                    rec22.hidden_weights, 
                    rec22.output_weights, 
                    rec22.output_bias, 
                    rec22.hidden_biases, 
                    rec22.hidden_states]:
                norme22.append(np.linalg.norm(l))

            norme11 = np.array(norme11)
            norme12 = np.array(norme12)
            norme21 = np.array(norme21)
            norme22 = np.array(norme22)

            print(f'Razlike 1: {np.abs(norme11 - norme12)}')
            print(f'Razlike 2: {np.abs(norme21 - norme22)}')"""

        update_recurrent_layer(rec11, lr, momentum=0)
        update_recurrent_layer(rec12, lr, momentum=0)
        update_recurrent_layer(rec21, lr, momentum=0)
        update_recurrent_layer(rec22, lr, momentum=0)

    #print(f'dInput weights: {np.linalg.norm(rec11.dinput_weights)}, dhidden weights: {np.linalg.norm(rec11.dhidden_weights)}, doutput weights: {np.linalg.norm(rec11.doutput_weights)}')
    #print(f'dInput weights: {np.linalg.norm(rec12.dinput_weights)}, dhidden weights: {np.linalg.norm(rec12.dhidden_weights)}, doutput weights: {np.linalg.norm(rec12.doutput_weights)}')

    if i % 100 == 0:
        print(f'Loss 1: {mse_loss(rec21.output.reshape(-1, 1), result[j:j+seq_len, :])}')
        print(f'Loss 2: {mse_loss(rec22.output.reshape(-1, 1), result[j:j+seq_len, :])}')

rec11.forward(sequence)
rec21.forward(rec11.output)
print(rec21.output.reshape(-1))
print(result.reshape(-1))

rec12.forward(sequence)
rec22.forward(rec12.output)
print(rec22.output.reshape(-1))
print(result.reshape(-1))

Loss 1: 568.4424851446765
Loss 2: 628.9403008649464
Loss 1: 2.035433117532382
Loss 2: 1.7851348236870457
Loss 1: 2.3172628489620677
Loss 2: 0.28974729190130327
Loss 1: 2.2939677823039615
Loss 2: 0.2786899329699611
Loss 1: 2.23750998051382
Loss 2: 0.05414823335795991
[52.26466962 52.89462815 52.92358133 52.93036208 52.94258359 52.94787717
 52.95245291 52.9545557  52.95594964 52.95654465]
[60 52 52 53 52 50 52 56 54 57]
[52.16317868 52.18469649 52.18464374 52.1845776  52.18445693 52.18414309
 52.18290263 52.22170328 57.37013405 53.28353593]
[60 52 52 53 52 50 52 56 54 57]


In [6]:
import pandas as pd

data = pd.read_csv('clean_weather.csv', names=['date', 'tmax', 'tmin', 'rain', 'tmax_tomorrow'], header=0)
data

Unnamed: 0,date,tmax,tmin,rain,tmax_tomorrow
0,1970-01-01,60.0,35.0,0.0,52.0
1,1970-01-02,52.0,39.0,0.0,52.0
2,1970-01-03,52.0,35.0,0.0,53.0
3,1970-01-04,53.0,36.0,0.0,52.0
4,1970-01-05,52.0,35.0,0.0,50.0
...,...,...,...,...,...
13504,2022-11-22,62.0,35.0,0.0,67.0
13505,2022-11-23,67.0,38.0,0.0,66.0
13506,2022-11-24,66.0,41.0,0.0,70.0
13507,2022-11-25,70.0,39.0,0.0,62.0


In [7]:
from sklearn.preprocessing import StandardScaler

FEATURES = ['tmax', 'tmin', 'rain']
TARGET = 'tmax_tomorrow'

X = data[FEATURES].to_numpy()
y = data[TARGET].to_numpy()

scaler = StandardScaler()
X = scaler.fit_transform(X)

print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (13509, 3)
y: (13509,)


In [8]:
threshold = 0.05

X_train = X[:int(threshold*len(X)),:].copy()
y_train = y[:int(threshold*len(X))].copy()

X_test = X[int(threshold*len(X)):,:].copy()
y_test = y[int(threshold*len(X)):].copy()

print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (675, 3), y_train: (675,)
X_test: (12834, 3), y_test: (12834,)


In [9]:
np.random.seed(0)
rec1 = RecurrentLayer(3, 4, 1)

lr = 1e-3
seq_len = 7

for i in range(51):

    epoch_loss = 0

    for j in range(len(X_train) - seq_len):

        rec1.forward(X_train[j:j+seq_len, :])

        epoch_loss += mse_loss(rec1.output, y_train[j:j+seq_len])

        loss_grad = mse_grad(rec1.output.reshape(-1), y_train[j:j+seq_len])

        rec1.backward(loss_grad)

        update_recurrent_layer(rec1, lr, momentum=0.)

    if i % 10 == 0:
        print(f'Epoch loss {i}: {epoch_loss / len(X_train)}')

    print(f'dInput weights: {np.linalg.norm(rec1.dinput_weights)}, dhidden weights: {np.linalg.norm(rec1.dhidden_weights)}, doutput weights: {np.linalg.norm(rec1.doutput_weights)}')

Epoch loss 0: 70.86887843368365
dInput weights: 0.22876374089578788, dhidden weights: 0.34597162404873794, doutput weights: 35.200825515898245
dInput weights: 0.3592668113932815, dhidden weights: 0.5234197358040739, doutput weights: 31.58690859835022
dInput weights: 2.1543973605988183, dhidden weights: 2.65290353262858, doutput weights: 24.623665385862484
dInput weights: 361.74004322415715, dhidden weights: 519.472237586879, doutput weights: 71.50236106670708
dInput weights: 334.1185205370646, dhidden weights: 468.6662874727242, doutput weights: 83.23951245937046
dInput weights: 253.28671419489206, dhidden weights: 355.9622585024941, doutput weights: 70.09075449646852
dInput weights: 220.0948881176314, dhidden weights: 309.7607926087972, doutput weights: 64.10724901281382
dInput weights: 196.19478151629727, dhidden weights: 276.33329595731726, doutput weights: 59.42537943163376
dInput weights: 177.52280556260555, dhidden weights: 249.19489130390892, doutput weights: 54.53404332055495
d

In [21]:
from random import randint

start = randint(0, len(X_test))
seq_len = 3
rec1.forward(X_test[start:start+seq_len, :])
print(rec1.output.reshape(-1))
print(y_test.reshape(-1)[start:start+seq_len])
print(f'Loss: {mse_loss(rec1.output.reshape(-1), y_test.reshape(-1)[start:start+seq_len])}')

[71.13724776 65.35577346 63.90170849]
[69. 65. 64.]
Loss: 0.7840106602377063
