# ***Deep Deterministic Policy Gradients with Instinctive Network***

In [None]:
%matplotlib qt

import gym, time, math
import tensorflow as tf, matplotlib.pyplot as plt, numpy as np

from tensorflow import random_uniform_initializer
from tensorflow.keras.layers import Input, Dense, Concatenate, Lambda, BatchNormalization 
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from typing import Tuple, List
from minisom import MiniSom

# Seed TensorFlow's and NumPy's global random generators so that weight
# initialisation and np.random-based noise are repeatable between runs.
global_seed = 42
tf.random.set_seed(global_seed)
np.random.seed(global_seed)

---
### **OU Action Noise**

In [None]:
class OUActionNoise(object):
    """Ornstein-Uhlenbeck process producing temporally correlated noise.

    Each call advances the process by one step of length `dt`:
    the state is pulled towards `mean` with strength `theta` and perturbed
    by Gaussian noise scaled by `sigma * sqrt(dt)`.

    Args:
        mean (Numpy array): Value the process reverts to; its shape fixes the
            shape of every sample.
        sigma (float): Scale of the random perturbation.
        theta (float): Strength of the pull towards `mean`.
        dt (float): Step length.
        x0 (Numpy array): Optional starting state; zeros are used when None.
    """

    def __init__(self, mean, sigma=0.5, theta=0.2, dt=0.1, x0=None):
        self.mean, self.sigma, self.theta = mean, sigma, theta
        self.dt, self.x0 = dt, x0
        self.reset()

    #--------------------------------------------------------------------------------
    def __call__(self):
        '''
        Returns:
            The next sample of the process (also stored as the new state)
        '''
        drift = self.theta * (self.mean - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        self.x_prev = self.x_prev + drift + diffusion

        return self.x_prev

    #--------------------------------------------------------------------------------
    def reset(self):
        '''
        Puts the process back to its starting state (x0, or zeros when x0 is None)
        '''
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mean)
        else:
            self.x_prev = self.x0

In [None]:
%%script false --no-raise-error

# Smoke test for OUActionNoise: draw one sample around a zero mean.
a = np.zeros(15)
b = OUActionNoise(a)
# Rebind instead of using `a += ...`: `a` is the very array the noise process
# keeps as its mean, so an in-place add would silently change that mean.
a = a + b()
a

---
### **Replay Buffer**

In [None]:
class ReplayBuffer(object):
    """Fixed-capacity store of transitions with uniform random sampling.

    Once `size` transitions are stored, every new transition overwrites the
    oldest one in place (ring buffer), so appending stays O(1) instead of
    shifting the whole list as `del buffer[0]` would.
    """

    def __init__(self, size, minibatch_size = None, seed = None):
        '''
        Args:
            size (integer): The maximum number of transitions kept.
            minibatch_size (integer): The sample size.
            seed (integer): Optional seed for the sampling generator. With the
            default (None) the generator is seeded unpredictably.
        Raises:
            ValueError: If size is smaller than 1.
        '''
        if size < 1:
            raise ValueError("size must be a positive integer")

        self.buffer = []
        self.minibatch_size = minibatch_size
        self.rand_generator = np.random.RandomState(seed)
        self.max_size = size
        #Slot holding the oldest transition; it is the next one overwritten once the buffer is full
        self._next = 0

    #--------------------------------------------------------------------------------
    def append(self, state, action, reward, next_state, context, next_context, done):
        '''
        Args:
            state (Numpy array): The state.
            action (Numpy array): The action.
            reward (float): The reward.
            next_state (Numpy array): The next state.
            context (Numpy array): The context associated with the state.
            next_context (Numpy array): The context associated with the next state.
            done (boolean): True if the next state is a terminal state and False otherwise.
            Is stored as an integer so that True = 1, False = 0
        '''
        transition = [state, action, reward, next_state, context, next_context, int(done)]

        if len(self.buffer) < self.max_size:
            self.buffer.append(transition)
        else:
            self.buffer[self._next] = transition
            self._next = (self._next + 1) % self.max_size

    #--------------------------------------------------------------------------------
    def sample(self):
        '''
        Returns:
            A list of minibatch_size transitions drawn uniformly with replacement, each one a list
            [state, action, reward, next_state, context, next_context, done]
        '''
        idxs = self.rand_generator.choice(np.arange(len(self.buffer)), size=self.minibatch_size)
        return [self.buffer[idx] for idx in idxs]

    #--------------------------------------------------------------------------------
    def size(self):
        '''
        Returns:
            Number of elements in the buffer
        '''
        return len(self.buffer)

    #--------------------------------------------------------------------------------
    def isMin(self):
        '''
        Returns:
            Boolean indicating if the memory have the minimum number of elements or not
        '''
        return (self.size() >= self.minibatch_size)

    #--------------------------------------------------------------------------------
    def empties(self):
        '''
        Removes every stored transition
        '''
        self.buffer.clear()
        self._next = 0

    #--------------------------------------------------------------------------------
    def getEpisode(self):
        '''
        Returns:
            List with all the elements in the buffer, oldest first
        '''
        if self._next == 0:
            return self.buffer
        #The buffer has wrapped around: rotate it so the oldest transition comes first
        return self.buffer[self._next:] + self.buffer[:self._next]

---
### **Instinctive Weights**

In [None]:
class InstinctiveWeights:
    """Symmetric connection weights between every pair of SOM neurons.

    Only the lower triangle (diagonal included) is stored, packed row by row
    in the 1-D array `weights`; `get_weight_matrix` expands it to the full
    symmetric matrix.
    """

    def __init__(self, som_shape: Tuple[int, int]):
        """Create zero weights for a SOM grid of shape `som_shape` (rows, cols)."""
        self.som_shape = som_shape
        self.num_neurons = int(np.prod(self.som_shape))
        # A symmetric n x n matrix has n*(n+1)/2 independent entries.
        self.weights = np.zeros(self.num_neurons * (self.num_neurons + 1) // 2)
        self.activations = np.zeros(self.som_shape)


    def __neuron_to_position(self, neuron: Tuple[int, int]) -> int:
        """Flatten a (row, col) grid coordinate into a row-major neuron index."""
        i, j = neuron
        return int((i*self.som_shape[1]) + j)


    def __positions_to_index(self, position: Tuple[int, int]) -> int:
        """Map a lower-triangle (row, col) pair, row >= col, to its packed index."""
        i, j = position
        return int((i*(i+1))//2 + j)


    def __get_tril_positions(self, n1: Tuple[int, int], n2: Tuple[int, int]) -> Tuple[int, int]:
        """Return the two flat neuron indices ordered as (larger, smaller)."""
        first = self.__neuron_to_position(n1)
        second = self.__neuron_to_position(n2)

        return max(first, second), min(first, second)


    def reinforce_connection(self, n1: Tuple[int, int], n2: Tuple[int, int], amount: float = 0.1) -> None:
        """Strengthen the connection between two neurons.

        Args:
            n1: Grid coordinate (row, col) of the first neuron.
            n2: Grid coordinate (row, col) of the second neuron; the order of
                n1 and n2 does not matter.
            amount: Increment added to the weight, which is kept within [0, 1].
        """
        array_index = self.__positions_to_index(self.__get_tril_positions(n1, n2))
        self.weights[array_index] = np.clip(self.weights[array_index] + amount, 0, 1)


    def step(self, decay: float = 0.995) -> None:
        """Decay every weight by the multiplicative factor `decay`."""
        self.weights *= decay


    def get_weight_matrix(self):
        """Return the full symmetric (num_neurons, num_neurons) weight matrix."""
        full_matrix = np.zeros((self.num_neurons, self.num_neurons))

        # np.tril_indices enumerates the lower triangle row by row, which is
        # exactly the packing order used by __positions_to_index.
        full_matrix[np.tril_indices(self.num_neurons)] = self.weights

        # Mirror the lower triangle to the upper one without doubling the diagonal
        return full_matrix + full_matrix.T - np.diag(np.diag(full_matrix))

---
### **Instinctive Layer**

In [None]:
class InstinctiveLayer:
    """Grid of charge-accumulating units driven by SOM activity.

    Every unit holds a charge. A piecewise activation function maps charge to
    activation: quadratic growth up to `func_x_drop`, then a linear segment
    from `func_y_min` back to 0 at `func_x_max`. `step` updates the charges
    from lateral connections and from the SOM activation.

    Args:
        som_shape: Shape (rows, cols) of the unit grid.
        func_x_max: Charge at which a unit is reset to 0.
        func_x_drop: Charge at which the activation function switches from
            the quadratic to the linear segment.
        func_y_max: Value of the quadratic segment at `func_x_drop`.
        func_y_min: Value of the linear segment right after `func_x_drop`.
        act_threshold: Activation value that is mapped to 1 after clipping.
    """

    def __init__(
        self, som_shape: Tuple[int, int], func_x_max: int, func_x_drop: float, 
        func_y_max: float, func_y_min: float, act_threshold: float,
    ):
        self.som_shape = som_shape
        self.act_threshold = act_threshold
        self.func_x_max = func_x_max
        self.func_x_drop = func_x_drop
        self.func_y_max = func_y_max
        self.func_y_min = func_y_min

        self.reset_charges()


    def get_act_array(self):
        """Return the current activations flattened to one dimension."""
        return np.reshape(self.get_activations(), -1)


    def get_active(self) -> List[Tuple[int, int]]:
        """Return the grid coordinates of the units whose activation is exactly 1."""
        rows, cols = np.where(self.get_activations() == 1)
        return list(zip(rows, cols))


    def step(self, weights, som_act) -> None:
        """Advance the charges by one time step.

        Args:
            weights: Square connection matrix with one row/column per unit
                (units flattened row-major).
            som_act: Array of shape `som_shape` with the external SOM drive.
                It is read only; the caller's array is left untouched.
        """
        # Charge spread through the lateral connections, capped per step.
        outgoing = (self.charges * self.get_activations()).reshape((1, -1))
        charge_delta = np.reshape(np.dot(outgoing, weights), self.charges.shape)
        charge_delta = np.clip(charge_delta, 0, self.func_x_max/50)

        # Scale into a new array instead of `som_act *= ...`, which would
        # modify the array owned by the caller.
        som_input = np.asarray(som_act) * (self.func_x_max/50)

        self.charges += (charge_delta + som_input)

        # Units past the drop point keep charging until they reset;
        # the others leak charge.
        self.charges[self.charges > self.func_x_drop] += self.func_x_max/100
        self.charges[self.charges <= self.func_x_drop] -= self.func_x_max/100

        self.charges[self.charges > self.func_x_max] = 0
        self.charges[self.charges < 0] = 0

        return


    def __apply_activation_function(self, charges):
        """Evaluate the piecewise activation function element-wise.

        Values above `func_x_max` fall outside both pieces and map to 0.
        """
        charges = np.asarray(charges, dtype=float)
        slope = (0 - self.func_y_min) / (self.func_x_max - self.func_x_drop)

        return np.piecewise(
            charges,
            [
                charges <= self.func_x_drop,
                (self.func_x_drop < charges) & (charges <= self.func_x_max),
            ],
            [
                lambda x: (self.func_y_max / self.func_x_drop**2) * x**2,  # Second degree growth
                lambda x: self.func_y_min + slope * (x - self.func_x_drop),  # Linear growth
            ]
        )


    def reset_charges(self):
        """Set every charge to its initial value, func_x_max/100."""
        self.charges = np.zeros(self.som_shape) + self.func_x_max/100


    def get_activations(self):
        """Return the activations for the current charges.

        The raw function value is clipped to [0, act_threshold] and divided
        by act_threshold. Units whose charge is past `func_x_drop` are then
        assigned the fixed value func_x_max/100.
        """
        activations = self.__apply_activation_function(self.charges)
        activations = np.clip(activations, 0, self.act_threshold)
        activations = activations/self.act_threshold
        # NOTE(review): this overrides the units in the linear segment with a
        # constant derived from func_x_max - confirm that is the intended value.
        activations[self.charges > self.func_x_drop] = self.func_x_max/100
        return activations


    def plot_act_func(self):
        """Plot the raw activation function over [0, func_x_max] with guide lines."""
        # Generate x values
        x = np.linspace(0, self.func_x_max, 1000)
        y = self.__apply_activation_function(x)

        # Plot the function
        plt.plot(x, y, label="Piecewise Function with Repetition")
        plt.xlabel('x')
        plt.ylabel('f(x)')
        plt.title('Piecewise Function: Exponential Growth, Peak, Drop, Linear Growth, and Repeat')
        plt.legend()
        plt.grid(True)
        plt.axhline(y=self.func_y_max, color='gray', linestyle='--')
        plt.axhline(y=self.func_y_min, color='gray', linestyle='--')
        plt.axvline(x=self.func_x_drop, color='gray', linestyle='--')
        plt.axvline(x=self.func_x_max, color='gray', linestyle='--')
        plt.axhline(y = 0, color = 'r', linestyle = '-') 
        plt.axhline(y = self.act_threshold, color = 'r', linestyle = '-') 
        plt.show()

In [None]:
%%script false --no-raise-error

# Demo: drive a 4x4 instinctive layer with random SOM input while connections
# are reinforced, and show the activation grid after every step.
som_shape = (4, 4)
func_x_max = 50
func_x_drop = 30 
func_y_max = 10 
func_y_min = -7 
act_threshold = 5

inst_w = InstinctiveWeights(som_shape)
inst_l= InstinctiveLayer(
    som_shape = som_shape,
    func_x_max = func_x_max, 
    func_x_drop = func_x_drop, 
    func_y_max = func_y_max, 
    func_y_min = func_y_min, 
    act_threshold = act_threshold,
)

# plot_act_func() draws the figure itself and returns None, so there is
# nothing to pass to display().
inst_l.plot_act_func()

def get_act():
    """Step the layer once with random SOM input; return (activations, charges copy)."""
    inst_l.step(inst_w.get_weight_matrix(), np.random.uniform(low=0, high=1, size=som_shape))
    return inst_l.get_activations(), inst_l.charges.copy()

# First batch of reinforced connections, as (n1, n2) pairs
first_connections = (
    [((0, 3), (0, 3))] * 7
    + [
        ((0, 3), (0, 0)),
        ((0, 0), (0, 0)),
        ((0, 0), (0, 0)),
        ((0, 0), (0, 1)),
        ((0, 1), (1, 0)),
    ]
)
for n1, n2 in first_connections:
    inst_w.reinforce_connection(n1=n1, n2=n2)
acts = [get_act() for _ in range(10)]

# Second batch of reinforced connections
second_connections = [
    ((0, 1), (1, 0)),
    ((0, 1), (1, 0)),
    ((1, 0), (2, 2)),
    ((2, 2), (2, 2)),
    ((2, 2), (0, 3)),
    ((2, 2), (0, 3)),
    ((2, 2), (0, 3)),
]
for n1, n2 in second_connections:
    inst_w.reinforce_connection(n1=n1, n2=n2)
acts.extend([get_act() for _ in range(10)])

arrays = [act[0] for act in acts]

# Determine the number of subplots
num_arrays = len(arrays)

# Determine the grid size
cols = int(np.ceil(np.sqrt(num_arrays)))
rows = int(np.ceil(num_arrays / cols))

# Create the subplots; squeeze=False keeps `axes` a 2-D array for any grid size
fig, axes = plt.subplots(rows, cols, figsize=(8, 5), squeeze=False)
flat_axes = axes.ravel()

# Plot each array in its subplot
for i, array in enumerate(arrays):
    ax = flat_axes[i]
    cax = ax.matshow(array, cmap='viridis', vmin=0, vmax=1)
    fig.colorbar(cax, ax=ax)
    ax.set_title(f'Array {i + 1}')

# Remove any empty subplots
for unused_ax in flat_axes[num_arrays:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

---
### **Instinctive Network**

In [None]:
class InstinctiveNetwork:
    """Couples a self-organising map (MiniSom) with an instinctive layer.

    The SOM turns an input vector into a grid of activations; the sequence of
    winning SOM neurons reinforces the connections in `inner_weights`; the
    `inner_layer` integrates both into the activation vector returned by
    `get_output`.
    """

    def __init__(self, som_dims: Tuple[int, int], input_dim: int, som_kwargs: dict, inst_net_kwargs: dict):
        """
        Args:
            som_dims: Shape (rows, cols) of the SOM and of the instinctive layer.
            input_dim: Length of the input vectors.
            som_kwargs: Extra keyword arguments forwarded to MiniSom.
            inst_net_kwargs: Keyword arguments forwarded to InstinctiveLayer
                (everything except `som_shape`).
        """
        self.som_dims = som_dims
        self.input_dim = input_dim
        self.som = MiniSom(*self.som_dims, self.input_dim, **som_kwargs)
        self.inner_weights = InstinctiveWeights(self.som_dims)
        self.inner_layer = InstinctiveLayer(som_shape = self.som_dims, **inst_net_kwargs)
        # Winner of the previous input; None at the start of a sequence
        self.last_winner = None


    def train_som(self, data):
        """Train the SOM on `data` for 5 iterations."""
        self.som.train(data, 5, verbose=False)


    def reinforce_connection(self, data):
        """Reinforce the link between the previous SOM winner and the winner for `data`.

        The first input after construction or `reset_charges` only records
        its winner, because there is no previous winner to connect to.
        """
        som_winner = self.som.winner(data)
        if self.last_winner is not None:
            self.inner_weights.reinforce_connection(self.last_winner, som_winner)
        self.last_winner = som_winner
        return
    

    def reset_charges(self):
        """Reset the layer charges and forget the previous winner."""
        self.inner_layer.reset_charges()
        self.last_winner = None
        return


    def get_output(self, data, reinforce=True):
        """Feed one input through the network and advance its state.

        Args:
            data: Input vector.
            reinforce: When True, decay all connection weights and reinforce
                the connection from the previous winner to the current one.

        Returns:
            Tuple (active, som_act): the flattened instinctive layer
            activations and the SOM activation grid rescaled so that the
            best-matching neuron is 1 and the worst is 0.
        """
        if reinforce:
            self.inner_weights.step()
            self.reinforce_connection(data)

        som_act = self.som.activate([data])
        lowest = np.amin(som_act)
        span = np.amax(som_act) - lowest
        if span > 0:
            som_act = 1 - ((som_act - lowest) / span)
        else:
            # Every neuron matches equally well. Dividing by the zero span
            # would produce NaN, which would then stay in the layer charges
            # permanently, so treat all neurons as best matches instead.
            som_act = np.ones_like(som_act, dtype=float)
        
        # Only neurons close to the best match drive the instinctive layer
        som_act_dist = som_act.copy()
        som_act_dist[som_act_dist < 0.9] = 0

        weights = self.inner_weights.get_weight_matrix()
        self.inner_layer.step(weights, som_act_dist)

        # Disabled alternative output: the mean SOM codebook vector of the active units
        # active = self.inner_layer.get_active()
        # active = np.array([self.som.get_weights()[x, y] for x, y in active])
        # if not len(active):
        #     return np.zeros(self.input_dim), act
        # active = np.mean(active, axis=0)

        active = self.inner_layer.get_act_array()
        return active, som_act

In [None]:
%%script false --no-raise-error

# Demo: train a 5x5 instinctive network on random data and collect its outputs.
inst_net = InstinctiveNetwork(
    som_dims = (5, 5), 
    input_dim = 16,
    som_kwargs = {
        'sigma': 3,
        'learning_rate': 0.05,
    }, 
    inst_net_kwargs = {
        'func_x_max': 50,
        'func_x_drop': 30,
        'func_y_max': 10,
        'func_y_min': -7,
        'act_threshold': 5,
    }
)

data = np.random.rand(100, 16)
inst_net.train_som(data)
out = []

for sample in data:
    # get_output() reinforces the connection itself (reinforce=True by default).
    # Calling reinforce_connection() first would reinforce every sample twice,
    # the second time as a connection from the winner to itself.
    active, _ = inst_net.get_output(sample)
    # Keep only the layer activations: storing the (active, som_act) tuples
    # would make `out` ragged, and np.shape() cannot describe a ragged list.
    out.append(active)

np.shape(out)

In [None]:
# TODO: Adapt the actor and the critic to receive inputs from the InstinctiveNetwork, and train the instinctive network:
    # * train the SOM together with the networks -> take all the states/actions of the batch and train on them
    # * train the inst_layer in real time: as each new state is generated, train it while producing the output for that state

---
### **Actor**

In [None]:
class Actor(object):
    """Policy network of the DDPG agent, with its target copy.

    The network has two inputs, the state and the context vector. Each goes
    through its own dense + batch-normalisation block, the results are
    concatenated and passed through two more blocks, and a tanh output is
    scaled by `act_range` to produce the action.
    """

    def __init__(self, s_inp_dim, s_fc1_dim, con_inp_dim, con_fc1_dim, fc2_dim, fc3_dim, out_dim, act_range, lr, tau):
        '''
        Args:
            s_inp_dim (integer): Size of the state input.
            s_fc1_dim (integer): Units of the state branch hidden layer.
            con_inp_dim (integer): Size of the context input.
            con_fc1_dim (integer): Units of the context branch hidden layer.
            fc2_dim (integer): Units of the first layer after the concatenation.
            fc3_dim (integer): Units of the second layer after the concatenation.
            out_dim (integer): Size of the action.
            act_range: Factor that multiplies the tanh output.
            lr (float): Learning rate of the Adam optimizer.
            tau (float): Soft update coefficient of the target network.
        '''
        #Network dimensions
        self.s_inp_dim = s_inp_dim
        self.s_fc1_dim = s_fc1_dim
        self.con_inp_dim = con_inp_dim
        self.con_fc1_dim = con_fc1_dim
        self.fc2_dim = fc2_dim
        self.fc3_dim = fc3_dim
        self.out_dim = out_dim
        #Range of the action space
        self.act_range = act_range
        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau
        #Optimizer learning rate
        self.lr = lr
        #Generates the optimization function
        self.optimizer = Adam(learning_rate=self.lr)
        #Generates the actor model
        self.model = self.buildNetwork()
        #Generates the actor target model
        self.target_model = self.buildNetwork()
        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())

    #--------------------------------------------------------------------
    def _hidden_block(self, units, inputs):
        '''
        Builds a ReLU dense layer followed by batch normalisation.
        Kernel and bias are initialised uniformly in +-1/sqrt(units).
        '''
        bound = 1 / np.sqrt(units)
        dense = Dense(
            units, activation='relu',
            kernel_initializer=random_uniform_initializer(-bound, bound),
            bias_initializer=random_uniform_initializer(-bound, bound), dtype='float64'
        )(inputs)
        return BatchNormalization(dtype='float64')(dense)

    #--------------------------------------------------------------------
    def buildNetwork(self):
        '''
        Returns:
            Keras model mapping [state, context] to the scaled action
        '''
        #State branch
        inp = Input(shape=(self.s_inp_dim,))
        norm1 = self._hidden_block(self.s_fc1_dim, inp)

        #Context branch
        inp_con = Input(shape=(self.con_inp_dim,))
        norm1_con = self._hidden_block(self.con_fc1_dim, inp_con)

        #Joint trunk
        c_inp = Concatenate(dtype='float64')([norm1, norm1_con])
        norm2 = self._hidden_block(self.fc2_dim, c_inp)
        norm3 = self._hidden_block(self.fc3_dim, norm2)

        #Small initial weights keep the first actions close to zero
        f3 = 0.003
        out = Dense(
            self.out_dim, activation='tanh',
            kernel_initializer=random_uniform_initializer(-f3, f3),
            bias_initializer=random_uniform_initializer(-f3, f3), dtype='float64'
        )(norm3)
        lamb = Lambda(lambda i: i * self.act_range, dtype='float64')(out)

        return Model(inputs=[inp, inp_con], outputs=[lamb])

    #--------------------------------------------------------------------
    def predict(self, states, contexts):
        '''
        Returns:
            Actions of the online model for the given batch (inference mode)
        '''
        return self.model([states, contexts], training=False)

    #--------------------------------------------------------------------
    def target_predict(self, states, contexts):
        '''
        Returns:
            Actions of the target model for the given batch (inference mode)
        '''
        return self.target_model([states, contexts], training=False)

    #--------------------------------------------------------------------
    def transferWeights(self):
        '''
        Soft update: target = tau * online + (1 - tau) * target
        '''
        new_weights = [
            (self.tau * online) + ((1.0 - self.tau) * target)
            for online, target in zip(self.model.get_weights(), self.target_model.get_weights())
        ]

        self.target_model.set_weights(new_weights)

    #--------------------------------------------------------------------
    def saveModel(self, path):
        '''
        Saves the online model weights to path + '_actor.h5'
        '''
        self.model.save_weights(path + '_actor.h5')

    #--------------------------------------------------------------------
    def loadModel(self, path):
        '''
        Loads the online model weights from the file at path (the full file
        name, suffix included) and copies them to the target model, so that
        both start from the loaded weights.
        '''
        self.model.load_weights(path)
        self.target_model.set_weights(self.model.get_weights())

---
### **Critic**

In [None]:
class Critic(object):
    """Action-value network of the DDPG agent, with its target copy.

    State and action each go through their own dense + batch-normalisation
    block; the results are concatenated and passed through two more blocks
    and a linear output layer.
    """

    def __init__(self, state_inp_dim, state_fc1_dim, action_inp_dim, action_fc1_dim, conc_fc1_dim, conc_fc2_dim, out_dim, lr, tau):
        '''
        Args:
            state_inp_dim (integer): Size of the state input.
            state_fc1_dim (integer): Units of the state branch hidden layer.
            action_inp_dim (integer): Size of the action input.
            action_fc1_dim (integer): Units of the action branch hidden layer.
            conc_fc1_dim (integer): Units of the first layer after the concatenation.
            conc_fc2_dim (integer): Units of the second layer after the concatenation.
            out_dim (integer): Size of the output.
            lr (float): Learning rate of the Adam optimizer.
            tau (float): Soft update coefficient of the target network.
        '''
        #Network dimensions
        self.state_inp_dim = state_inp_dim
        self.state_fc1_dim = state_fc1_dim
        self.action_inp_dim = action_inp_dim
        self.action_fc1_dim = action_fc1_dim
        self.conc_fc2_dim = conc_fc2_dim
        self.conc_fc1_dim = conc_fc1_dim
        self.out_dim = out_dim
        #Optimizer learning rate
        self.lr = lr
        #Define the critic optimizer
        self.optimizer = Adam(learning_rate=self.lr)
        #Parameter that coordinates the soft updates on the target weights
        self.tau = tau
        #Generate the critic network
        self.model = self.buildNetwork()
        #Generate the critic target network
        self.target_model = self.buildNetwork()
        #Set the weights to be the same in the begining
        self.target_model.set_weights(self.model.get_weights())

    #--------------------------------------------------------------------
    def _hidden_block(self, units, inputs):
        '''
        Builds a ReLU dense layer followed by batch normalisation.
        Kernel and bias are initialised uniformly in +-1/sqrt(units).
        '''
        bound = 1 / np.sqrt(units)
        dense = Dense(
            units, activation='relu', 
            kernel_initializer=random_uniform_initializer(-bound, bound), 
            bias_initializer=random_uniform_initializer(-bound, bound), dtype='float64'
        )(inputs)
        return BatchNormalization(dtype='float64')(dense)

    #--------------------------------------------------------------------
    def buildNetwork(self):
        '''
        Returns:
            Keras model mapping [state, action] to the estimated value
        '''
        #State input network ---------
        s_inp = Input(shape=(self.state_inp_dim, ))
        s_norm1 = self._hidden_block(self.state_fc1_dim, s_inp)
        
        #Action input network ---------
        a_inp = Input(shape=(self.action_inp_dim, ))
        a_norm1 = self._hidden_block(self.action_fc1_dim, a_inp)
        
        #Concatenate the two networks ---
        c_inp = Concatenate(dtype='float64')([s_norm1, a_norm1])
        
        #Creates the output network
        c_norm1 = self._hidden_block(self.conc_fc1_dim, c_inp)
        c_norm2 = self._hidden_block(self.conc_fc2_dim, c_norm1)
        
        #Small initial weights keep the first value estimates close to zero
        f3 = 0.003
        out = Dense(
            self.out_dim, activation='linear', 
            kernel_initializer=random_uniform_initializer(-f3, f3), 
            bias_initializer=random_uniform_initializer(-f3, f3), dtype='float64'
        )(c_norm2)
        
        return Model(inputs=[s_inp, a_inp], outputs=[out])
    
    #--------------------------------------------------------------------
    def predict(self, states, actions):
        '''
        Returns:
            Values of the online model for the given batch (inference mode)
        '''
        return self.model([states, actions], training=False)
    
    #--------------------------------------------------------------------
    def target_predict(self, states, actions):
        '''
        Returns:
            Values of the target model for the given batch (inference mode)
        '''
        return self.target_model([states, actions], training=False)
    
    #--------------------------------------------------------------------
    def transferWeights(self):
        '''
        Soft update: target = tau * online + (1 - tau) * target
        '''
        new_weights = [
            (self.tau * online) + ((1.0 - self.tau) * target)
            for online, target in zip(self.model.get_weights(), self.target_model.get_weights())
        ]
        
        self.target_model.set_weights(new_weights)
        
    #--------------------------------------------------------------------
    def saveModel(self, path):
        '''
        Saves the online model weights to path + '_critic.h5'
        '''
        self.model.save_weights(path + '_critic.h5')
    
    #--------------------------------------------------------------------
    def loadModel(self, path):
        '''
        Loads the online model weights from the file at path (the full file
        name, suffix included) and copies them to the target model, so that
        both start from the loaded weights.
        '''
        self.model.load_weights(path)
        self.target_model.set_weights(self.model.get_weights())

---
### **DDPG Agent**

In [None]:
class DDPGAgent(object):
    """DDPG agent whose actor also receives a context vector produced by an
    InstinctiveNetwork fed with the observations.

    The agent owns the replay buffer, the instinctive network, the actor, the
    critic and the exploration noise, and provides the training loop (`train`)
    and a rendered evaluation episode (`act`).
    """

    def __init__(
        self, state_dim, action_dim, action_min, action_max, 
        memory_size, batch_size, gamma, a_lr, c_lr, tau, epsilon, 
        epsilon_decay, epsilon_min, max_steps, env_name
    ):
        """
        Args:
            state_dim: Size of the observation vector.
            action_dim: Size of the action vector.
            action_min: Lower bound used to clip actions.
            action_max: Upper bound used to clip actions; also the factor
                that scales the actor output.
            memory_size: Capacity of the replay buffer.
            batch_size: Number of transitions per training batch.
            gamma: Discount factor.
            a_lr: Actor learning rate.
            c_lr: Critic learning rate.
            tau: Soft update coefficient of the target networks.
            epsilon: Initial probability of adding exploration noise.
            epsilon_decay: Factor applied to epsilon after every training batch.
            epsilon_min: Lower bound of epsilon.
            max_steps: Step count after which an episode is cut.
            env_name: Environment id used by `act` to build a rendering environment.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_min = action_min
        self.action_max = action_max
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.a_lr = a_lr
        self.c_lr = c_lr
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.max_steps = max_steps
        self.env_name = env_name

        #Creates the Replay Buffer
        self.memory = ReplayBuffer(self.memory_size, self.batch_size)

        # creates instinctive network
        som_dim = (15, 15)
        self.inst_net = InstinctiveNetwork(
            som_dims = som_dim, 
            input_dim = self.state_dim,
            som_kwargs = {
                'sigma': 1,
                'learning_rate': 0.5,
                'decay_function': lambda x, y, z: x,
                'neighborhood_function': 'bubble'
            }, 
            inst_net_kwargs = {
                'func_x_max': 50,
                'func_x_drop': 35,
                'func_y_max': 10,
                'func_y_min': -7,
                'act_threshold': 4,
            }
        )

        #Creates the actor
        self.actor = Actor(
            s_inp_dim=self.state_dim, 
            s_fc1_dim=128,
            con_inp_dim=np.prod(som_dim), 
            con_fc1_dim=1024,
            fc2_dim=256, 
            fc3_dim=64,
            out_dim=self.action_dim, 
            act_range=self.action_max, 
            lr=self.a_lr, 
            tau=self.tau,
        )

        #Creates the critic
        self.critic = Critic(
            state_inp_dim=self.state_dim, 
            state_fc1_dim=64, 
            action_inp_dim=self.action_dim, 
            action_fc1_dim=32,
            conc_fc1_dim=1024, 
            conc_fc2_dim=256,
            out_dim=1,
            lr=self.c_lr, 
            tau=self.tau,
        )
        
        #Creates the noise generator
        self.ou_noise = OUActionNoise(mean=np.zeros(action_dim))

        self.create_plot()
        return


    def create_plot(self):
        """Create the figure with one subplot for the SOM and one for the instinctive layer."""
        self.fig = plt.figure()

        self.som_act_plot = self.fig.add_subplot(211)
        self.som_act_plot.title.set_text('SOM Activation')

        self.instinctive_layer_plot = self.fig.add_subplot(212)
        self.instinctive_layer_plot.title.set_text('Instinctive Layer Activations')
        return


    def update_plots(self, som_act, inst_layer_act):
        """Redraw both activation grids.

        Each axis is cleared before drawing: calling imshow repeatedly on the
        same axis would stack one more image artist per call and make every
        redraw slower. Clearing also removes the title, so it is set again.
        """
        panels = (
            (self.som_act_plot, 'SOM Activation', som_act),
            (self.instinctive_layer_plot, 'Instinctive Layer Activations', inst_layer_act),
        )
        for axis, title, data in panels:
            axis.clear()
            axis.set_title(title)
            axis.imshow(data)

        self.fig.canvas.draw()
        self.fig.canvas.flush_events()
        return


    def policy(self, state, explore=True):
        """Choose the action for `state` and advance the instinctive network.

        This call is stateful: it steps the instinctive network, so it must
        be made once per visited state.

        Args:
            state: Observation vector.
            explore: When True, OU noise is added with probability epsilon.

        Returns:
            Tuple (action, context, som_act): the clipped action, the context
            vector given to the actor and the SOM activation grid.
        """
        # The instinctive connections are only reinforced once the replay
        # buffer holds at least one batch.
        context, som_act = self.inst_net.get_output(state, self.memory.isMin())
        action = self.actor.predict(np.reshape(state, (1, -1)), np.reshape(context, (1, -1)))[0]
        #Takes the exploration with the epsilon probability
        if explore and np.random.rand() < self.epsilon:
            action += self.ou_noise()
            
        action = np.clip(action, a_min=self.action_min, a_max=self.action_max)
        return action, context, som_act


    def learn(self, state, action, reward, next_state, context, next_context, done):
        """Store one transition and, once a full batch is available, train on a sampled batch."""
        self.memory.append(state, action, reward, next_state, context, next_context, done)
        
        if self.memory.isMin():
            self.replay_memory()
        return


    def replay_memory(self):
        """Run one training step on a batch sampled from the replay buffer.

        Trains the SOM on the batch states, updates the critic on the
        squared TD error, updates the actor to maximise the critic value of
        its own actions, soft-updates both target networks and decays epsilon.
        """
        # Get sample experiences from the replay buffer
        experiences = self.memory.sample()
        
        #Get each term of the experiences
        states = np.array([exp[0] for exp in experiences])
        actions = np.array([exp[1] for exp in experiences])
        rewards = np.array([exp[2] for exp in experiences])
        next_states = np.array([exp[3] for exp in experiences])
        contexts = np.array([exp[4] for exp in experiences])
        next_contexts = np.array([exp[5] for exp in experiences])
        done = np.array([int(exp[6]) for exp in experiences])

        self.inst_net.train_som(states)
        
        #Change the dimensions of the rewards and done arrays
        rewards = rewards[:, np.newaxis]
        done = done[:, np.newaxis]
        
        #Train the critic
        with tf.GradientTape() as tape:
            #Compute the critic target values
            target_actions = self.actor.target_predict(next_states, next_contexts)
            y = rewards + self.gamma * self.critic.target_predict(next_states, target_actions) * (1 - done)
            #Compute the q_value of each state, action pair
            critic_value = self.critic.predict(states, actions)
            #Compute the critic loss 
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_grad, self.critic.model.trainable_variables))
        
        #Train the actor
        with tf.GradientTape() as tape:
            acts = self.actor.predict(states, contexts)
            critic_grads = self.critic.predict(states, acts)
            #Used -mean as we want to maximize the value given by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_grads)
            
        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grad, self.actor.model.trainable_variables))
        
        #Update the model weights
        self.actor.transferWeights()
        self.critic.transferWeights() 
        
        #Decay the epsilon value, never letting it fall below the minimum
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return


    def act(self):
        """Run one rendered episode without exploration noise.

        A separate environment is created for rendering and is always closed,
        even when the episode is interrupted by an exception.
        """
        # NOTE(review): `hardcore=True` is passed for every env_name - confirm
        # the agent is only meant to be used with environments that accept it.
        env2 = gym.make(self.env_name, hardcore=True, render_mode='human')
        try:
            #Reset the envirorment
            observation, _ = env2.reset()
            self.inst_net.reset_charges()
            done = False
            step = 0
            
            while not done:
                env2.render()
                action, _, _ = self.policy(observation, explore=False)
                #self.update_plots(som_act, self.inst_net.inner_layer.get_activations())
                observation, _, terminated, truncated, _ = env2.step(action)
                step += 1
                #Stop on truncation too: the environment must not be stepped after it
                done = terminated or truncated or (step > self.max_steps)
        finally:
            env2.close()
        return


    def train(self, env, num_episodes, verbose, verbose_num, end_on_complete, complete_num, complete_value, act_after_batch, plot_act):
        """Train the agent on `env`.

        Args:
            env: Environment following the 5-value `step` API
                (observation, reward, terminated, truncated, info).
            num_episodes: Maximum number of episodes.
            verbose: When True, print a progress line at every step.
            verbose_num: Number of episodes between summary reports.
            end_on_complete: When True, stop once `complete_num` episodes
                since the last report reached `complete_value`.
            complete_num: Number of completed episodes that ends the training.
            complete_value: Score from which an episode counts as completed.
            act_after_batch: When True, run `act` after every report.
            plot_act: When True, redraw the activation plots at every step.

        Returns:
            Tuple (scores_history, steps_history) with one entry per episode.
        """
        scores_history = []
        steps_history = []

        print("BEGIN\n")
        complete = 0
        
        for episode in range(num_episodes):
            done = False
            score = 0
            steps = 0
            observation, _ = env.reset()
            self.inst_net.reset_charges()

            # The policy is stateful (it steps the instinctive network), so it
            # is evaluated exactly once per state: here for the first state,
            # and afterwards the evaluation made for the next state is reused.
            action, context, som_act = self.policy(observation)
            
            while not done:
                if verbose:
                    print("\r                                                                                                     ", end="")
                    print(f"\rEpisode: {str(episode+1)} \tStep: {str(steps)} \tReward: {str(score)}", end="")
                
                new_observation, reward, terminated, truncated, _ = env.step(action)
                next_action, new_context, som_act = self.policy(new_observation)
                
                done = terminated
                #An episode cut by the time limit (ours or the environment's) is penalised and ended
                if truncated or steps > self.max_steps:
                    reward = -50
                    done = True

                self.learn(observation, action, reward, new_observation, context, new_context, done)
                observation, action, context = new_observation, next_action, new_context
                score += reward
                steps += 1

                if plot_act:
                    self.update_plots(som_act, self.inst_net.inner_layer.get_activations())

            scores_history.append(score)
            steps_history.append(steps)
            
            #If the score is bigger or equal than the complete score it add one to the completed number
            if(score >= complete_value):
                complete += 1
                #If the flag is true the agent ends the training once enough episodes were completed
                if end_on_complete and complete >= complete_num: break
            
            #These information are printed after each verbose_num episodes
            if((episode+1)%verbose_num == 0):
                print("\r                                                                                                          ", end="")
                print(f'''\rEpisodes: {episode+1}/{num_episodes}\n\tTotal reward: {np.mean(scores_history[-verbose_num:])} +- {np.std(scores_history[-verbose_num:])}\n\tNum. steps: {np.mean(steps_history[-verbose_num:])} +- {np.std(steps_history[-verbose_num:])}\n\tCompleted: {complete}\n--------------------------''')
                
                #If the flag is true the agent act and render the episode after each verbose_num episodes
                if act_after_batch: self.act()
                
                #Set the number of completed episodes on the batch to zero
                complete = 0

        print("\nFINISHED")
        
        return scores_history, steps_history


    def save(self, path):
        """Save the actor and critic weights using `path` as the file name prefix."""
        self.actor.saveModel(path)
        self.critic.saveModel(path)
        return


    def load(self, a_path, c_path):
        """Load the actor weights from `a_path` and the critic weights from `c_path`."""
        self.actor.loadModel(a_path)
        self.critic.loadModel(c_path)
        return

---
### **Test**

In [None]:
# Training environment (off-screen rendering)
name = "BipedalWalker-v3"
env = gym.make(name, render_mode='rgb_array', hardcore=True)

# Problem dimensions and action bounds, read from the environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_min, action_max = env.action_space.low, env.action_space.high

# Replay buffer
memory_size = 1000000
batch_size = 200

# Learning
gamma = 0.99
a_lr = 5e-4
c_lr = 9e-4
tau = 8e-3

# Exploration
epsilon = 1
epsilon_decay = 0.9999
epsilon_min = 0.5

# Episode length limit
max_steps = 1600

agent = DDPGAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    action_min=action_min,
    action_max=action_max,
    memory_size=memory_size,
    batch_size=batch_size,
    gamma=gamma,
    a_lr=a_lr,
    c_lr=c_lr,
    tau=tau,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    epsilon_min=epsilon_min,
    max_steps=max_steps,
    env_name=name,
)

In [None]:
# Training schedule
num_episodes = 3000

# Reporting: a progress line per step and a summary every `verbose_num` episodes
verbose = True
verbose_num = 50
act_after_batch = True

# Early stop: end once `complete_num` episodes reach `complete_value`
end_on_complete = True
complete_num = 2
complete_value = 300

agent.train(
    env=env,
    num_episodes=num_episodes,
    verbose=verbose,
    verbose_num=verbose_num,
    end_on_complete=end_on_complete,
    complete_num=complete_num,
    complete_value=complete_value,
    act_after_batch=act_after_batch,
    plot_act=False,
)

In [None]:
# Run one rendered episode with the current policy and no exploration noise.
agent.act()

In [None]:
#agent.save('/home/gustavo/PROG/RL_networks/11.1_DDPG_'+name+'/')