In [27]:
import numpy as np
import matplotlib.pyplot as plt

# Recurrent neural network

- used with sequential data (time series, sentences...)

- parameters: input weights $\mathbf{W}_i$, hidden weights $\mathbf{W}_h$ and output weights $\mathbf{W}_o$

# Forward pass

$$\mathbf{z}_t = \mathbf{h}_{t-1} \cdot \mathbf{W}_h + \mathbf{x}_t \cdot \mathbf{W}_i + \mathbf{b}_i$$
$$\mathbf{h}_{t} = \phi(\mathbf{z}_t)$$
$$\mathbf{y}_t = \mathbf{h}_{t} \cdot \mathbf{W}_o + \mathbf{b}_o$$

# Gradients

$$\frac{\partial \mathbf{z}_t}{\partial \mathbf{h}_{t-1}} = \mathbf{W}_h \quad \quad \frac{\partial \mathbf{z}_t}{\partial \mathbf{W}_h} = \mathbf{h}_{t-1} \quad \quad \frac{\partial \mathbf{z}_t}{\partial \mathbf{x}} = \mathbf{W}_i \quad \quad \frac{\partial \mathbf{z}_t}{\partial \mathbf{W}_i} = \mathbf{x} \quad \quad \frac{\partial \mathbf{z}_t}{\partial \mathbf{b}_i} = 1$$

$$\frac{\partial \mathbf{h}_t}{\partial \mathbf{z}_t} = \phi'(\mathbf{z}_t)$$

$$\frac{\partial \mathbf{y}_t}{\partial \mathbf{h}_{t}} = \mathbf{W}_o \quad \quad \frac{\partial \mathbf{y}_t}{\partial \mathbf{W}_o} = \mathbf{h}_{t} \quad \quad \frac{\partial \mathbf{y}_t}{\partial \mathbf{b}_o} = 1$$

# Backward pass


$$
\begin{align*}

\begin{aligned}
\frac{\partial \mathcal{L}_t}{\partial \mathbf{W}_o} &= \frac{\partial \mathcal{L}_t}{\partial \mathbf{y}_t} \cdot \frac{\partial \mathbf{y}_t}{\partial \mathbf{W}_o} \\
&= \delta_t \cdot \mathbf{h}_t
\end{aligned} \quad \quad

\begin{aligned}
\frac{\partial \mathcal{L}}{\partial \mathbf{W}_o} = \sum_t \frac{\partial \mathcal{L}_t}{\partial \mathbf{W}_o}
\end{aligned}

\end{align*}
$$

$$
\begin{align*}


\begin{aligned}

\frac{\partial \mathcal{L}_t}{\partial \mathbf{b}_o} &= \frac{\partial \mathcal{L}_t}{\partial \mathbf{y}_t} \cdot \frac{\partial \mathbf{y}_t}{\partial \mathbf{b}_o} \\
&= \delta_t \cdot 1 \\
&= \delta_t

\end{aligned} \quad \quad


\begin{aligned}

\frac{\partial \mathcal{L}}{\partial \mathbf{b}_o} = \sum_t \frac{\partial \mathcal{L}_t}{\partial \mathbf{b}_o}

\end{aligned}


\end{align*}
$$

$$ 

\begin{align*}

\begin{aligned}

\frac{\partial \mathcal{L}_t}{\partial \mathbf{W}_h} &= \left( \frac{\partial \mathcal{L}_t}{\partial \mathbf{y}_t} \cdot \frac{\partial \mathbf{y}_t}{\partial \mathbf{h}_{t}} + \mathbf{g}_{t+1} \cdot \frac{\partial \mathbf{z}_{t+1}}{\partial \mathbf{h}_{t}} \right) \cdot \frac{\partial \mathbf{h}_t}{\partial \mathbf{z}_t} \cdot \frac{\partial \mathbf{z}_t}{\partial \mathbf{W}_h} \\

&= \left( \delta_t \cdot \mathbf{W}_o + \mathbf{g}_{t+1} \cdot \mathbf{W}_h \right) \cdot \phi'(\mathbf{z}_t)\cdot \mathbf{h}_{t-1} \\

&= \left[\mathbf{d}_t \odot \phi'(\mathbf{z}_t) \right] \cdot \mathbf{h}_{t-1} \\

&= \mathbf{g}_t \cdot \mathbf{h}_{t-1}

\end{aligned} \quad \quad

\begin{aligned}
\frac{\partial \mathcal{L}}{\partial \mathbf{W}_h} = \sum_t \frac{\partial \mathcal{L}_t}{\partial \mathbf{W}_h}
\end{aligned}

\end{align*}
$$

$$ 

\begin{align*}

\begin{aligned}

\frac{\partial \mathcal{L}_t}{\partial \mathbf{W}_i} &= \left( \frac{\partial \mathcal{L}_t}{\partial \mathbf{y}_t} \cdot \frac{\partial \mathbf{y}_t}{\partial \mathbf{h}_{t}} + \mathbf{g}_{t+1} \cdot \frac{\partial \mathbf{z}_{t+1}}{\partial \mathbf{h}_{t}} \right) \cdot \frac{\partial \mathbf{h}_t}{\partial \mathbf{z}_t} \cdot \frac{\partial \mathbf{z}_t}{\partial \mathbf{W}_i} \\

&= \left( \delta_t \cdot \mathbf{W}_o + \mathbf{g}_{t+1} \cdot \mathbf{W}_h \right) \cdot \phi'(\mathbf{z}_t)\cdot \mathbf{x}_{t} \\

&= \left[\mathbf{d}_t \odot \phi'(\mathbf{z}_t) \right] \cdot \mathbf{x}_{t} \\

&= \mathbf{g}_t \cdot \mathbf{x}_{t}

\end{aligned} \quad \quad

\begin{aligned}
\frac{\partial \mathcal{L}}{\partial \mathbf{W}_i} = \sum_t \frac{\partial \mathcal{L}_t}{\partial \mathbf{W}_i}
\end{aligned}

\end{align*}
$$

$$ 

\begin{align*}

\begin{aligned}

\frac{\partial \mathcal{L}_t}{\partial \mathbf{b}_i} &= \left( \frac{\partial \mathcal{L}_t}{\partial \mathbf{y}_t} \cdot \frac{\partial \mathbf{y}_t}{\partial \mathbf{h}_{t}} + \mathbf{g}_{t+1} \cdot \frac{\partial \mathbf{z}_{t+1}}{\partial \mathbf{h}_{t}} \right) \cdot \frac{\partial \mathbf{h}_t}{\partial \mathbf{z}_t} \cdot \frac{\partial \mathbf{z}_t}{\partial \mathbf{b}_i} \\

&= \left( \delta_t \cdot \mathbf{W}_o + \mathbf{g}_{t+1} \cdot \mathbf{W}_h \right) \cdot \phi'(\mathbf{z}_t)\cdot 1 \\

&= \mathbf{d}_t \odot \phi'(\mathbf{z}_t) \\

&= \mathbf{g}_t

\end{aligned} \quad \quad

\begin{aligned}
\frac{\partial \mathcal{L}}{\partial \mathbf{b}_i} = \sum_t \frac{\partial \mathcal{L}_t}{\partial \mathbf{b}_i}
\end{aligned}

\end{align*}
$$

$$ \frac{1}{2} $$

In [28]:
from dlfs.base import Layer

class RecurrentLayer(Layer):

    def __init__(self, n_inputs: int, n_hidden: int, n_outputs: int) -> None:
        """
        Recurrent layer.

        Parameters
        ----------
        n_inputs : int
            Number of input features.

        n_hidden : int
            Number of hidden features.

        n_outputs : int
            Number of output features.

        Attributes
        ----------
        input_weights : numpy.ndarray
            Matrix of input weight coefficients.

        hidden_weights : numpy.ndarray
            Matrix of hidden weight coefficients.

        output_weights : numpy.ndarray
            Matrix of output weight coefficients.

        input_bias : numpy.ndaray
            Vector of input bias coefficients.

        output_bias : numpy.ndaray
            Vector of output bias coefficients.
        """
        k = 1 / np.sqrt(n_hidden)
        self.n_hidden = n_hidden
        self.input_weights = np.random.uniform(-k, k, (n_inputs, n_hidden))
        self.hidden_weights = np.random.uniform(-k, k, (n_hidden, n_hidden))
        self.output_weights = np.random.uniform(-k, k, (n_hidden, n_outputs))
        self.input_bias = np.random.uniform(-k, k, (n_hidden))
        self.output_bias = np.random.uniform(-k, k, (n_outputs))
        
    def forward(self, inputs: np.ndarray) -> None:
        """
        Forward pass using the recurrent layer. Creates hidden states and output attributes.

        Parameters
        ----------
        inputs : numpy.ndarray
            Input matrix.

        Returns
        -------
        None
        """
        # Store inputs for later use
        self.inputs = inputs

        # Store number of samples
        self.n_samples = inputs.shape[0]

        self.sequence_length = inputs.shape[1]

        # Initialize output
        self.output = np.zeros((self.n_samples, self.sequence_length, self.output_weights.shape[1]))

        # Initialize hidden states
        self.hidden_states = np.zeros((self.n_samples, self.sequence_length, self.n_hidden))

        for i, sequence in enumerate(inputs):

            for j, x in enumerate(sequence):

                # Reshape to match dimensions
                x = x.reshape(1, -1)

                input_x = np.dot(x, self.input_weights)

                hidden_x = input_x + np.dot(self.hidden_states[i, max(j-1, 0)], self.hidden_weights) + self.input_bias

                # Activation function
                hidden_x = np.tanh(hidden_x)

                # Store current hidden state
                self.hidden_states[i, j] = hidden_x.copy()

                output_x = np.dot(hidden_x, self.output_weights) + self.output_bias

                # Store current output
                self.output[i, j] = output_x.copy()

    def backward(self, delta: np.ndarray) -> None:
        """
        Backward pass using the recurrent layer. 
        Creates gradient attributes with respect to input weights, hidden weights, output weights, input bias, output bias and inputs.

        Parameters
        ----------
        delta : np.ndarray
            Accumulated gradient obtained by backpropagation.

        Returns
        -------
        None
        """
        # Initialize gradient attributes
        self.dinput_weights = np.zeros_like(self.input_weights)
        self.dhidden_weights = np.zeros_like(self.hidden_weights)
        self.dinput_bias = np.zeros_like(self.input_bias)
        self.doutput_weights = np.zeros_like(self.output_weights)
        self.doutput_bias = np.zeros_like(self.output_bias)
        self.dinputs = np.zeros_like(self.inputs, dtype=np.float64)

        for i in range(self.n_samples - 1, -1, -1):

            # Initialize next hidden gradient
            next_hidden_gradient = None

            for j in range(self.sequence_length - 1, -1, -1):

                loss_gradient = delta[i, j].reshape(1, -1)
                hidden_state = self.hidden_states[i, j].reshape(-1, 1)

                self.doutput_weights += np.dot(hidden_state, loss_gradient)
                self.doutput_bias += loss_gradient.reshape(-1)

                hidden_gradient = np.dot(loss_gradient, self.output_weights.T)
                if next_hidden_gradient is not None:
                    hidden_gradient += np.dot(next_hidden_gradient, self.hidden_weights.T)

                dtanh = 1 - self.hidden_states[i, j]**2
                hidden_gradient *= dtanh

                next_hidden_gradient = hidden_gradient.copy()

                if j > 0:
                    self.dhidden_weights += np.dot(self.hidden_states[i, j-1].reshape(-1, 1), hidden_gradient)

                self.dinput_weights += np.dot(self.inputs[i, j].reshape(-1, 1), hidden_gradient)
                self.dinput_bias += hidden_gradient.reshape(-1)
                
                self.dinputs[i, j] += np.dot(self.input_weights, hidden_gradient.T).reshape(-1)

class RecurrentLayerHidden(Layer):

    def __init__(self, n_inputs: int, n_hidden: int) -> None:
        """
        Recurrent layer.

        Parameters
        ----------
        n_inputs : int
            Number of input features.

        n_hidden : int
            Number of hidden features.

        n_outputs : int
            Number of output features.

        Attributes
        ----------
        input_weights : numpy.ndarray
            Matrix of input weight coefficients.

        hidden_weights : numpy.ndarray
            Matrix of hidden weight coefficients.

        output_weights : numpy.ndarray
            Matrix of output weight coefficients.

        input_bias : numpy.ndaray
            Vector of input bias coefficients.

        output_bias : numpy.ndaray
            Vector of output bias coefficients.
        """
        k = 1 / np.sqrt(n_hidden)
        self.n_hidden = n_hidden
        self.input_weights = np.random.uniform(-k, k, (n_inputs, n_hidden))
        self.hidden_weights = np.random.uniform(-k, k, (n_hidden, n_hidden))
        self.input_bias = np.random.uniform(-k, k, (n_hidden))
        
    def forward(self, inputs: np.ndarray) -> None:
        """
        Forward pass using the recurrent layer. Creates hidden states and output attributes.

        Parameters
        ----------
        inputs : numpy.ndarray
            Input matrix.

        Returns
        -------
        None
        """
        # Store inputs for later use
        self.inputs = inputs

        # Store number of samples
        self.n_samples = inputs.shape[0]

        self.sequence_length = inputs.shape[1]

        # Initialize output
        self.output = np.zeros((self.n_samples, self.n_hidden))

        # Initialize hidden states
        self.hidden_states = np.zeros((self.n_samples, self.sequence_length, self.n_hidden))

        for i, sequence in enumerate(inputs):

            for j, x in enumerate(sequence):

                # Reshape to match dimensions
                x = x.reshape(1, -1)

                input_x = np.dot(x, self.input_weights)

                hidden_x = input_x + np.dot(self.hidden_states[i, max(j-1, 0)], self.hidden_weights) + self.input_bias

                # Activation function
                hidden_x = np.tanh(hidden_x)

                # Store current hidden state
                self.hidden_states[i, j] = hidden_x.copy()

            # Store current output
            self.output[i] = self.hidden_states[i, -1].copy()

    def backward(self, delta: np.ndarray) -> None:
        """
        Backward pass using the recurrent layer. 
        Creates gradient attributes with respect to input weights, hidden weights, output weights, input bias, output bias and inputs.

        Parameters
        ----------
        delta : np.ndarray
            Accumulated gradient obtained by backpropagation.

        Returns
        -------
        None
        """
        # Initialize gradient attributes
        self.dinput_weights = np.zeros_like(self.input_weights)
        self.dhidden_weights = np.zeros_like(self.hidden_weights)
        self.dinput_bias = np.zeros_like(self.input_bias)
        self.dinputs = np.zeros_like(self.inputs, dtype=np.float64)

        for i in range(self.n_samples - 1, -1, -1):

            # Initialize next hidden gradient
            next_hidden_gradient = None

            for j in range(self.sequence_length - 1, -1, -1):

                loss_gradient = delta[i].reshape(1, -1)

                hidden_gradient = loss_gradient.copy()
                if next_hidden_gradient is not None:
                    hidden_gradient += np.dot(next_hidden_gradient, self.hidden_weights)

                dtanh = 1 - self.hidden_states[i, j]**2
                hidden_gradient *= dtanh

                next_hidden_gradient = hidden_gradient.copy()

                if j > 0:
                    self.dhidden_weights += np.dot(self.hidden_states[i, j-1].reshape(-1, 1), hidden_gradient)

                self.dinput_weights += np.dot(self.inputs[i, j].reshape(-1, 1), hidden_gradient)
                self.dinput_bias += hidden_gradient.reshape(-1)
                
                self.dinputs[i, j] += np.dot(self.input_weights, hidden_gradient.T).reshape(-1)

class RNN:

    def __init__(self, n_inputs: int, n_hidden: int, n_layers: int = 1) -> None:
        """
        Recurrent neural network.

        Parameters
        ----------
        n_inputs : int
            Number of input features.

        n_hidden : int
            Number of hidden features.
        """
        self.recurrent_layers = [RecurrentLayerHidden(n_inputs, n_hidden)]
        if n_layers > 1:
            for _ in range(n_layers - 1):
                self.recurrent_layers.append(RecurrentLayerHidden(n_hidden, n_hidden))

    def forward(self, inputs: np.ndarray) -> None:

        self.recurrent_layers[0].forward(inputs)

        for idx, layer in enumerate(self.recurrent_layers[1:], start=1):
            layer.forward(self.recurrent_layers[idx - 1].hidden_states)

        self.output = self.recurrent_layers[-1].output.copy()

    def backward(self, delta: np.ndarray) -> None:

        self.recurrent_layers[-1].backward(delta)

        for idx, layer in reversed(list(enumerate(self.recurrent_layers[:-1]))):
            layer.backward(self.recurrent_layers[idx + 1].dinputs)

In [29]:
def convert_data_to_sequence(X, y, sequence_length):
    X_new = []
    y_new = []

    for i in range(len(X) - sequence_length):
        X_new.append(X[i:i+sequence_length, :])
        y_new.append(y[i+sequence_length])

    X_new, y_new = np.array(X_new), np.array(y_new)

    return X_new, y_new

In [30]:
from sklearn.preprocessing import StandardScaler

sequence = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).reshape(-1, 1)
scaler = StandardScaler()
sequence = scaler.fit_transform(sequence.reshape(-1, 1))
result = np.array([60, 52, 52, 53, 52, 50, 52, 56, 54, 57]).reshape(-1, 1)

In [31]:
seq_len = 3
sequence_new, result_new = convert_data_to_sequence(sequence, result, seq_len)

print(f'Sequence: {sequence.shape}, results: {result.shape}')
print(f'Sequence new: {sequence_new.shape}, results new: {result_new.shape}')

Sequence: (10, 1), results: (10, 1)
Sequence new: (7, 3, 1), results new: (7, 1)


In [32]:
from dlfs import Model
from dlfs.layers import RecurrentLayer
from dlfs.loss import MSE_Loss
from dlfs.optimizers import Optimizer_SGD

np.random.seed(0)

layers = [RecurrentLayer(1, 5, 3), 
          RecurrentLayer(3, 7, 1)]
lr = 5e-3

model = Model(layers=layers, loss_function=MSE_Loss(), optimizer=Optimizer_SGD(learning_rate=lr))

#model.train(sequence_new, result_new, epochs=500, batch_size=None, print_every=100)
#print(f'{model.predict(sequence)}')

In [33]:
from dlfs import Model
from dlfs.layers import RecurrentLayer, DenseLayer
from dlfs.loss import MSE_Loss
from dlfs.optimizers import Optimizer_SGD

np.random.seed(0)

layers = [RecurrentLayerHidden(1, 50), 
          DenseLayer(50, 1)]
lr = 5e-2

#result_new = np.array([53, 52, 50, 52, 56, 54, 57, 56]).reshape(-1, 1)

model = Model(layers=layers, loss_function=MSE_Loss(), optimizer=Optimizer_SGD(learning_rate=lr))

model.train(sequence_new, result_new, epochs=500, batch_size=None, print_every=100)
print(f'{model.predict(sequence)}')

===== EPOCH : 0 ===== LOSS : 1425.2647383778499 =====
===== EPOCH : 100 ===== LOSS : 1.3774086216882584 =====
===== EPOCH : 200 ===== LOSS : 1.3768691843246585 =====
===== EPOCH : 300 ===== LOSS : 1.3763303774884217 =====
===== EPOCH : 400 ===== LOSS : 1.3757920345979413 =====
===== EPOCH : 500 ===== LOSS : 1.3752541552485338 =====
[[45.39529815]
 [46.22327222]
 [47.04763335]
 [47.86604969]
 [48.67621924]
 [49.47590003]
 [50.26293922]
 [51.0353003 ]
 [51.79108747]
 [52.52856648]]


In [34]:
import pandas as pd

data = pd.read_csv('clean_weather.csv', names=['date', 'tmax', 'tmin', 'rain', 'tmax_tomorrow'], header=0)
data.head(10)
#data.tail(10)

Unnamed: 0,date,tmax,tmin,rain,tmax_tomorrow
0,1970-01-01,60.0,35.0,0.0,52.0
1,1970-01-02,52.0,39.0,0.0,52.0
2,1970-01-03,52.0,35.0,0.0,53.0
3,1970-01-04,53.0,36.0,0.0,52.0
4,1970-01-05,52.0,35.0,0.0,50.0
5,1970-01-06,50.0,38.0,0.0,52.0
6,1970-01-07,52.0,43.0,0.0,56.0
7,1970-01-08,56.0,49.0,0.24,54.0
8,1970-01-09,54.0,50.0,0.4,57.0
9,1970-01-10,57.0,50.0,0.0,57.0


In [35]:
from sklearn.preprocessing import StandardScaler

FEATURES = ['tmax', 'tmin', 'rain']
TARGET = 'tmax_tomorrow'

X = data[FEATURES].to_numpy()
y = data[TARGET].to_numpy()

print(f'X: {X.shape}')
print(f'y: {y.shape}')

X: (13509, 3)
y: (13509,)


In [36]:
threshold = 0.02

X_train = X[:int(threshold*len(X)),:].copy()
y_train = y[:int(threshold*len(X))].copy()

X_test = X[int(threshold*len(X)):,:].copy()
y_test = y[int(threshold*len(X)):].copy()

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (270, 3), y_train: (270,)
X_test: (13239, 3), y_test: (13239,)


In [37]:
seq_len = 7
X_train_new, y_train_new = convert_data_to_sequence(X_train, y_train.reshape(-1, 1), seq_len)
X_test_new, y_test_new = convert_data_to_sequence(X_test, y_test.reshape(-1, 1), seq_len)
print(f'shapes: {X_train_new.shape}, {y_train_new.shape}')
print(f'shapes: {X_test_new.shape}, {y_test_new.shape}')

shapes: (263, 7, 3), (263, 1)
shapes: (13232, 7, 3), (13232, 1)


In [38]:
np.random.seed(0)
layers = [RecurrentLayerHidden(3, 4),
          DenseLayer(4, 1)]
lr = 1e-4

model = Model(layers=layers, loss_function=MSE_Loss(), optimizer=Optimizer_SGD(learning_rate=lr, momentum=0.5))
model.train(X_train_new, y_train_new, epochs=50, batch_size=1, print_every=10)

===== EPOCH : 0 ===== LOSS : 3664.86700090967 =====
===== EPOCH : 10 ===== LOSS : 1755.8492319681461 =====
===== EPOCH : 20 ===== LOSS : 1042.3108602829775 =====
===== EPOCH : 30 ===== LOSS : 746.4693865325233 =====
===== EPOCH : 40 ===== LOSS : 612.2120418247232 =====
===== EPOCH : 50 ===== LOSS : 546.2726310394227 =====


In [39]:
X_test_new, y_test_new = convert_data_to_sequence(X_test, y_test, sequence_length=10)

In [40]:
from random import randint

start = randint(0, len(X_test_new))
y_pred = model.predict(X_test_new[start:start+5].reshape(5, *X_test_new[0].shape))
print(y_pred)
print(y_test[start:start+5])
print(f'Loss: {model.loss_function.calculate(y_pred, y_test[start:start+5])}')

[[55.12179357]
 [54.42053747]
 [57.55431227]
 [60.38021396]
 [56.42478534]]
[69. 59. 58. 60. 52.]
Loss: 21.095672518281834


In [46]:
print(f'x_train: {X_train_new.shape}')
print(f'y_train: {y_train_new.shape}')

x_train: (263, 7, 3)
y_train: (263, 1)


In [61]:
np.random.seed(0)
from dlfs.layers import RNN

layers = [RNN(3, 1), DenseLayer(1, 1)]

lr = 1e-3

model = Model(layers=layers, loss_function=MSE_Loss(), optimizer=Optimizer_SGD(learning_rate=lr, momentum=0.))
model.train(X_train_new, y_train_new, epochs=50, batch_size=None, print_every=10)

===== EPOCH : 0 ===== LOSS : 2196.569995184542 =====
===== EPOCH : 10 ===== LOSS : 28.844170899675913 =====
===== EPOCH : 20 ===== LOSS : 202.39847466369582 =====
===== EPOCH : 30 ===== LOSS : 40.41449498259472 =====
===== EPOCH : 40 ===== LOSS : 26.10478566903436 =====
===== EPOCH : 50 ===== LOSS : 24.833312614288186 =====


In [50]:
print(layers[0].recurrent_layers[-1].output.shape)

(263, 15)


In [51]:
model.loss_function.calculate(layers[0].recurrent_layers[-1].output, y_train_new)

2131.7148288973385

In [42]:
model = RNN(3, 4, n_layers=5)
model.forward(X_train_new)

In [43]:
loss_fn = MSE_Loss()
y_pred = model.output.copy()
loss_fn.backward(y_pred, y_train_new)
#print(grad)

In [44]:
model.backward(loss_fn.dinputs)