# ***Deep Deterministic Policy Gradients with Instinctive Network***

In [1]:
%matplotlib qt

import gym
import tensorflow as tf, matplotlib.pyplot as plt, numpy as np

from tensorflow.keras.layers import Dense, BatchNormalization 
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from typing import Tuple

# Seed TensorFlow's and NumPy's global random generators.
global_seed = 42
tf.random.set_seed(global_seed)
np.random.seed(global_seed)

2024-09-10 16:15:40.646813: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


---
### **OU Action Noise**

In [2]:
class OUActionNoise:
    """Ornstein-Uhlenbeck exploration noise.

    Each call advances the process by one step:

        x <- x + theta * (mean - x) * dt + sigma * sqrt(dt) * N(0, 1)

    Args:
        mean: Long-run mean of the process; its shape fixes the noise shape.
        sigma: Scale of the Gaussian increment.
        theta: Strength of the pull back towards ``mean``.
        dt: Time step.
        x0: Optional initial value (broadcast to the shape of ``mean``);
            zeros are used when it is None.
    """

    def __init__(self, mean, sigma=0.7, theta=0.3, dt=0.1, x0=None):
        self.mean = tf.constant(mean, dtype=tf.float32)
        self.sigma = tf.constant(sigma, dtype=tf.float32)
        self.theta = tf.constant(theta, dtype=tf.float32)
        self.dt = tf.constant(dt, dtype=tf.float32)
        self.x0 = x0
        # The process state lives in a tf.Variable. Rebinding a Python
        # attribute inside a tf.function only happens while tracing, so the
        # compiled graph would keep reading the very first value forever and
        # the noise would degenerate to uncorrelated Gaussian samples.
        self.x_prev = tf.Variable(tf.zeros_like(self.mean), trainable=False, dtype=tf.float32)
        self.reset()


    @tf.function
    def __call__(self):
        """Advance the process one step and return the new noise sample."""
        x = self.x_prev + self.theta * (self.mean - self.x_prev) * self.dt + \
            self.sigma * tf.sqrt(self.dt) * tf.random.normal(self.mean.shape)
        # assign() is a graph op, so the update persists across calls.
        self.x_prev.assign(x)
        return x


    def reset(self):
        """Restore the process state to ``x0`` (or zeros when ``x0`` is None)."""
        if self.x0 is not None:
            start = tf.broadcast_to(tf.constant(self.x0, dtype=tf.float32), self.mean.shape)
        else:
            start = tf.zeros_like(self.mean)
        self.x_prev.assign(start)
        return

---
### **Replay Buffer**

In [3]:
class ReplayBuffer(object):
    """FIFO experience store with uniform random minibatch sampling.

    Each stored transition is the list
    ``[state, action, reward, next_state, context, next_context, int(done)]``.

    Args:
        size: Maximum number of transitions kept; the oldest one is evicted
            when the buffer is full.
        minibatch_size: Number of transitions returned by ``sample``.
        seed: Optional seed for the sampling generator. ``None`` (the
            default) keeps the original behaviour of an OS-seeded generator.
    """

    def __init__(self, size, minibatch_size = None, seed = None):
        self.buffer = []
        self.minibatch_size = minibatch_size
        # The sampler has its own generator, so np.random.seed() does not
        # affect it; pass `seed` to make sampling reproducible.
        self.rand_generator = np.random.RandomState(seed)
        self.max_size = size


    def append(self, state, action, reward, next_state, context, next_context, done):
        """Store one transition, evicting the oldest one when full."""
        if self.buffer and self.size() >= self.max_size:
            del self.buffer[0]
        self.buffer.append([state, action, reward, next_state, context, next_context, int(done)])
        return


    def sample(self):
        """Return ``minibatch_size`` transitions drawn uniformly with replacement.

        Raises:
            ValueError: If no minibatch size was configured or the buffer is
                empty.
        """
        if self.minibatch_size is None:
            raise ValueError("minibatch_size must be set before sampling")
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")
        # randint avoids allocating an index array as long as the buffer on
        # every call.
        idxs = self.rand_generator.randint(0, len(self.buffer), size=self.minibatch_size)
        return [self.buffer[idx] for idx in idxs]


    def size(self):
        """Return the number of stored transitions."""
        return len(self.buffer)


    def isMin(self):
        """Return True once at least ``minibatch_size`` transitions are stored."""
        if self.minibatch_size is None:
            return False
        return (self.size() >= self.minibatch_size)


    def empties(self):
        """Remove every stored transition."""
        self.buffer.clear()
        return


    def getEpisode(self):
        """Return the underlying list of transitions (not a copy)."""
        return self.buffer

---
### **SOM**

In [4]:
class SOM:
    """Self-organising map on an ``m x n`` grid of ``dim``-dimensional units.

    Args:
        m, n: Grid height and width.
        dim: Length of each input / weight vector.
        n_iterations: Number of passes ``train`` makes over its input.
        alpha: Learning rate.
        sigma: Neighbourhood radius; defaults to ``max(m, n) / 2``.
    """

    def __init__(self, m, n, dim, n_iterations, alpha, sigma=None):
        self.m = m
        self.n = n
        self.dim = dim
        self.n_iterations = n_iterations

        radius = max(m, n) / 2.0 if sigma is None else sigma

        self.alpha = tf.Variable(alpha, dtype=tf.float32)
        self.sigma = tf.Variable(radius, dtype=tf.float32)

        # One weight row per unit; unit k sits at grid cell (k // n, k % n).
        self.weights = tf.Variable(tf.random.uniform([m * n, dim]))
        self.locations = tf.constant(
            [(row, col) for row in range(m) for col in range(n)], dtype=tf.float32
        )


    def _squared_distances(self, input_vector):
        """Squared Euclidean distance from ``input_vector`` to every unit."""
        offset = tf.cast(self.weights, dtype=tf.float32) - tf.cast(input_vector, dtype=tf.float32)
        return tf.reduce_sum(tf.square(offset), axis=1)


    @tf.function
    def get_bmu(self, input_vector):
        """Return the flat index of the best-matching unit."""
        return tf.argmin(self._squared_distances(input_vector))


    @tf.function
    def update_weights(self, input_vector, bmu_index):
        """Pull every unit towards ``input_vector``, weighted by grid distance to the BMU."""
        winner_cell = tf.gather(self.locations, bmu_index)
        grid_dist_sq = tf.reduce_sum(tf.square(self.locations - winner_cell), axis=1)
        neighborhood = tf.exp(-grid_dist_sq / (2 * tf.square(self.sigma)))

        rate = tf.expand_dims(
            tf.cast(neighborhood, dtype=tf.float32) * tf.cast(self.alpha, dtype=tf.float32),
            axis=1,
        )
        pull = tf.cast(input_vector, dtype=tf.float32) - tf.cast(self.weights, dtype=tf.float32)
        self.weights.assign_add(rate * pull)
        return


    @tf.function
    def train_step(self, input_vector):
        """Find the BMU for one sample and update the map around it."""
        winner = self.get_bmu(input_vector)
        self.update_weights(input_vector, winner)
        return


    @tf.function
    def train(self, input_data):
        """Run ``n_iterations`` passes of ``train_step`` over the rows of ``input_data``."""
        for _ in range(self.n_iterations):
            for sample in input_data:
                self.train_step(sample)
        return


    def index_to_2d(self, index):
        """Convert a flat unit index into its ``(row, col)`` grid position."""
        return (index // self.n, index % self.n)


    @tf.function
    def activate(self, input_vector):
        """Return the squared distances to ``input_vector`` as an ``m x n`` grid."""
        return tf.reshape(self._squared_distances(input_vector), [self.m, self.n])

---
### **Instinctive Weights**

In [5]:
class InstinctiveWeights:
    """Symmetric connection-strength matrix between the units of a SOM grid.

    Every unit of a ``som_shape`` grid gets one row and one column. Entries
    are raised by ``reinforce_connection`` and shrunk by ``step``.
    """

    def __init__(self, som_shape: Tuple[int, int]):
        self.som_shape = som_shape
        self.num_neurons = tf.reduce_prod(self.som_shape)
        self.weights = tf.Variable(
            tf.zeros((self.num_neurons, self.num_neurons)), dtype=tf.float32
        )
        # Multiplicative factor applied to every weight on each step().
        self.decay = 0.999


    @tf.function
    def _neuron_to_position(self, neuron):
        """Map a ``(row, col)`` grid coordinate to its flat row-major index."""
        row, col = neuron
        return tf.cast(row * self.som_shape[1] + col, tf.int32)


    @tf.function
    def reinforce_connection(self, n1, n2):
        """Raise the weight between units ``n1`` and ``n2`` by 0.1, capped to [0, 1].

        The new value is written to both ``[n1, n2]`` and ``[n2, n1]``.
        """
        src = self._neuron_to_position(n1)
        dst = self._neuron_to_position(n2)
        strengthened = tf.clip_by_value(self.weights[src, dst] + 0.1, 0, 1)
        self.weights[src, dst].assign(strengthened)
        self.weights[dst, src].assign(strengthened)


    @tf.function
    def step(self):
        """Multiply every weight by ``decay``."""
        decayed = self.weights * self.decay
        self.weights.assign(decayed)


    @tf.function
    def get_weight_matrix(self):
        """Return the weight variable."""
        return self.weights

---
### **Instinctive Layer**

In [6]:
class InstinctiveLayer:
    """Grid of charge-accumulating units driven by SOM activity and lateral weights.

    Each unit holds a charge in ``[0, func_x_max]``. A piecewise function of
    the charge (quadratic up to ``func_x_drop``, linear from ``func_y_min`` to
    0 between ``func_x_drop`` and ``func_x_max``) is clipped and normalised to
    produce the unit's activation.

    Args:
        som_shape: ``(rows, cols)`` of the unit grid.
        func_x_max: Upper bound of the charge.
        func_x_drop: Charge at which the activation function switches branch.
        func_y_max: Value of the quadratic branch at ``func_x_drop``.
        func_y_min: Value of the linear branch just above ``func_x_drop``.
        act_threshold: Clip level; activations are divided by it.
    """

    def __init__(
        self, som_shape, func_x_max, func_x_drop, func_y_max, func_y_min, act_threshold,
    ):
        self.som_shape = som_shape
        self.act_threshold = act_threshold
        self.func_x_max = func_x_max
        self.func_x_drop = func_x_drop
        self.func_y_max = func_y_max
        self.func_y_min = func_y_min

        # Every unit starts with a small resting charge of func_x_max / 100.
        self.charges = tf.Variable(tf.zeros(self.som_shape) + self.func_x_max/100, dtype=tf.float32)


    @tf.function
    def get_act_array(self):
        """Return the current activations flattened to one dimension."""
        return tf.reshape(self.get_activations(), [-1])


    def get_active(self):
        """Return the ``(row, col)`` positions whose activation equals 1.

        Runs eagerly: a Python comprehension over symbolic tensors cannot be
        traced by ``tf.function``, so the indices are materialised with
        ``.numpy()`` and returned as plain integer tuples.
        """
        act = self.get_activations()
        active = tf.where(tf.equal(act, 1))
        return [(int(i), int(j)) for i, j in active.numpy()]


    @tf.function
    def step(self, weights, som_act) -> None:
        """Advance all charges by one time step.

        Args:
            weights: Square unit-to-unit weight matrix, multiplied with the
                flattened ``charges * activations`` row vector.
            som_act: External drive with the same shape as the grid; it is
                scaled by ``func_x_max / 50`` before being added.
        """
        # Lateral input: active units push charge to their neighbours.
        charge_delta = tf.reshape(tf.matmul(tf.reshape(self.charges * self.get_activations(), (1, -1)), weights), self.charges.shape)
        charge_delta = tf.clip_by_value(charge_delta, 0, self.func_x_max/50)

        som_act *= self.func_x_max/50

        self.charges.assign_add(tf.cast(charge_delta, dtype=tf.float32) + tf.cast(som_act, dtype=tf.float32))

        mask_above_drop = tf.greater(self.charges, self.func_x_drop)
        mask_below_drop = tf.less_equal(self.charges, self.func_x_drop)

        # Units past the drop point keep climbing; the others leak charge.
        self.charges.assign(tf.where(mask_above_drop, self.charges + self.func_x_max/100, self.charges))
        self.charges.assign(tf.where(mask_below_drop, self.charges - self.func_x_max/100, self.charges))

        self.charges.assign(tf.clip_by_value(self.charges, 0, self.func_x_max))
        return


    @tf.function
    def _apply_activation_function(self, charges):
        """Evaluate the piecewise charge-to-activation function element-wise."""
        def func(x):
            cond1 = tf.less_equal(x, self.func_x_drop)
            cond2 = tf.logical_and(tf.greater(x, self.func_x_drop), tf.less_equal(x, self.func_x_max))

            # Quadratic rise reaching func_y_max at x == func_x_drop.
            y1 = tf.cast(self.func_y_max / tf.square(self.func_x_drop), dtype=tf.float32) * tf.cast(tf.square(x), dtype=tf.float32)
            # Linear ramp from func_y_min back up to 0 at x == func_x_max.
            y2 = tf.cast(self.func_y_min, dtype=tf.float32) + tf.cast((0 - self.func_y_min) / (self.func_x_max - self.func_x_drop), dtype=tf.float32) * tf.cast(x - self.func_x_drop, dtype=tf.float32)

            return tf.where(cond1, y1, tf.where(cond2, y2, tf.zeros_like(x, dtype=tf.float32)))

        return func(charges)


    @tf.function
    def reset_charges(self):
        """Reset every charge to the resting value ``func_x_max / 100``."""
        self.charges.assign(tf.zeros(self.som_shape) + self.func_x_max/100)
        return


    @tf.function
    def get_activations(self):
        """Return the activation grid derived from the current charges.

        The raw function value is clipped to ``[0, act_threshold]`` and
        divided by ``act_threshold``; units whose charge exceeds
        ``func_x_drop`` are set to the constant ``func_x_max / 100`` instead.
        """
        activations = self._apply_activation_function(self.charges)
        activations = tf.clip_by_value(activations, 0, self.act_threshold)
        activations = activations / self.act_threshold
        activations = tf.where(tf.greater(self.charges, self.func_x_drop), tf.ones_like(activations) * self.func_x_max/100, activations)
        return activations


    def plot_act_func(self):
        """Plot the raw activation function over ``[0, func_x_max]`` with guide lines."""
        # Generate x values
        x = np.linspace(0, self.func_x_max, 1000)
        y = self._apply_activation_function(x)

        # Plot the function
        plt.plot(x, y, label="Piecewise Function with Repetition")
        plt.xlabel('x')
        plt.ylabel('f(x)')
        plt.title('Piecewise Function: Exponential Growth, Peak, Drop, Linear Growth, and Repeat')
        plt.legend()
        plt.grid(True)
        # Grey dashed lines mark the function parameters, red lines the clip range.
        plt.axhline(y=self.func_y_max, color='gray', linestyle='--')
        plt.axhline(y=self.func_y_min, color='gray', linestyle='--')
        plt.axvline(x=self.func_x_drop, color='gray', linestyle='--')
        plt.axvline(x=self.func_x_max, color='gray', linestyle='--')
        plt.axhline(y = 0, color = 'r', linestyle = '-') 
        plt.axhline(y = self.act_threshold, color = 'r', linestyle = '-') 
        plt.show()
        return

In [7]:
%%script false --no-raise-error

# Parameters of a small 4x4 demo layer.
som_shape = (4, 4)
func_x_max = 50
func_x_drop = 30
func_y_max = 10
func_y_min = -7
act_threshold = 5

inst_w = InstinctiveWeights(som_shape)
inst_l = InstinctiveLayer(
    som_shape=som_shape,
    func_x_max=func_x_max,
    func_x_drop=func_x_drop,
    func_y_max=func_y_max,
    func_y_min=func_y_min,
    act_threshold=act_threshold,
)

inst_l.plot_act_func()


def get_act():
    """Step the layer with uniform random drive; return (activations, charges) as arrays."""
    random_drive = np.random.uniform(low=0, high=1, size=som_shape)
    inst_l.step(inst_w.get_weight_matrix(), random_drive)
    return inst_l.get_activations().numpy(), inst_l.charges.numpy()


# First batch of (n1, n2) connections to reinforce, then 100 layer steps.
first_wave = [((0, 3), (0, 3))] * 7 + [
    ((0, 3), (0, 0)),
    ((0, 0), (0, 0)),
    ((0, 0), (0, 0)),
    ((0, 0), (0, 1)),
    ((0, 1), (1, 0)),
]
for n1, n2 in first_wave:
    inst_w.reinforce_connection(n1=n1, n2=n2)
acts = [get_act() for _ in range(100)]

# Second batch of connections, then another 100 layer steps.
second_wave = [
    ((0, 1), (1, 0)),
    ((0, 1), (1, 0)),
    ((1, 0), (2, 2)),
    ((2, 2), (2, 2)),
] + [((2, 2), (0, 3))] * 3
for n1, n2 in second_wave:
    inst_w.reinforce_connection(n1=n1, n2=n2)
acts += [get_act() for _ in range(100)]

# Keep only the activation grids for plotting.
arrays = [activation for activation, _charges in acts]

# Determine the number of subplots
num_arrays = len(arrays)
n = arrays[0].shape[0]

# Near-square grid large enough to hold one subplot per array.
cols = int(np.ceil(np.sqrt(num_arrays)))
rows = int(np.ceil(num_arrays / cols))

fig, axes = plt.subplots(rows, cols, figsize=(8, 5))

# One heat map per recorded step.
for i, activation_map in enumerate(arrays):
    ax = axes.flat[i]
    cax = ax.matshow(activation_map, cmap='viridis', vmin=0, vmax=1)
    fig.colorbar(cax, ax=ax)
    ax.set_title(f'Array {i + 1}')

# Drop the unused axes at the end of the grid.
for j in range(i + 1, len(axes.flat)):
    fig.delaxes(axes.flat[j])

plt.tight_layout()
plt.show()

---
### **Instinctive Network**

In [8]:
class InstinctiveNetwork:
    """Combines a SOM, a lateral weight matrix and a charge layer.

    The SOM maps an input vector to a grid of distances. Consecutive winning
    units are linked in ``inner_weights``, and ``inner_layer`` integrates the
    SOM activity together with those links into a per-unit activation.

    Args:
        som_dims: ``(rows, cols)`` of the SOM / unit grid.
        input_dim: Length of the input vectors.
        som_kwargs: Extra keyword arguments for ``SOM``.
        inst_net_kwargs: Keyword arguments for ``InstinctiveLayer`` (all but
            ``som_shape``).
    """

    def __init__(self, som_dims, input_dim, som_kwargs, inst_net_kwargs):
        self.som_dims = som_dims
        self.input_dim = input_dim
        self.som = SOM(*self.som_dims, self.input_dim, **som_kwargs)
        self.inner_weights = InstinctiveWeights(self.som_dims)
        self.inner_layer = InstinctiveLayer(som_shape = self.som_dims, **inst_net_kwargs)
        # Grid position of the previous winner; (-1, -1) means "none yet".
        self.last_winner = tf.Variable([-1, -1], dtype=tf.int32)


    @tf.function
    def train_som(self, data):
        """Train the underlying SOM on ``data``."""
        self.som.train(data)
        return


    @tf.function
    def reinforce_connection(self, data):
        """Link the previous winning unit with the winner for ``data``.

        Nothing is reinforced on the first call after a reset, because there
        is no previous winner yet. The current winner is always remembered.
        """
        som_winner = self.som.get_bmu(data)
        som_winner = self.som.index_to_2d(som_winner)

        valid_last_winner = tf.not_equal(self.last_winner[0], -1)

        def reinforce():
            self.inner_weights.reinforce_connection(
                (self.last_winner[0], self.last_winner[1]),
                som_winner
            )

        tf.cond(valid_last_winner, reinforce, lambda: None)
        self.last_winner.assign(som_winner)
        return


    @tf.function
    def reset_charges(self):
        """Reset the layer charges and forget the previous winner."""
        self.inner_layer.reset_charges()
        self.last_winner.assign([-1, -1])
        return


    @tf.function
    def get_output(self, data, reinforce=True):
        """Feed one input through the network and advance its state.

        Args:
            data: One input vector.
            reinforce: When True, decay the lateral weights and reinforce the
                link between the previous and the current winner first.

        Returns:
            ``(active, som_act)``: the flattened layer activations and the
            SOM similarity grid (1 for the closest unit, 0 for the farthest).
        """
        if reinforce:
            self.inner_weights.step()
            self.reinforce_connection(data)

        som_act = self.som.activate(tf.expand_dims(data, 0))
        # Min-max normalise the distances and invert them into a similarity.
        # divide_no_nan yields 0 when all distances are equal, so a flat map
        # gives similarity 1 everywhere instead of NaN, which would be added
        # into the charges and persist until the next reset.
        lowest = tf.reduce_min(som_act)
        spread = tf.reduce_max(som_act) - lowest
        som_act = 1 - tf.math.divide_no_nan(som_act - lowest, spread)

        # Only units very close to the winner drive the layer.
        som_act_dist = tf.where(som_act < 0.95, tf.zeros_like(som_act), som_act)

        weights = self.inner_weights.get_weight_matrix()
        self.inner_layer.step(weights, som_act_dist)

        active = self.inner_layer.get_act_array()
        return active, som_act

In [9]:
%%script false --no-raise-error

# Demo: train a 5x5 instinctive network on random 16-d vectors and plot the
# SOM similarity grid produced for every sample.
inst_net = InstinctiveNetwork(
    som_dims = (5, 5), 
    input_dim = 16,
    som_kwargs = {
        'n_iterations': 500,
        'alpha': 0.5,
        'sigma': 3,
    }, 
    inst_net_kwargs = {
        'func_x_max': 50,
        'func_x_drop': 30,
        'func_y_max': 10,
        'func_y_min': -7,
        'act_threshold': 5,
    }
)

data = np.random.rand(100, 16)
inst_net.train_som(data)
out = []

for sample in data:
    # get_output() reinforces the winner link itself (reinforce=True by
    # default). Calling reinforce_connection() first as well would reinforce
    # twice per sample and add a spurious self-connection for every winner.
    out.append(inst_net.get_output(sample))

# Each entry is an (activations, som_grid) pair of differently shaped
# tensors, so report the list dimensions instead of np.shape on ragged data.
display((len(out), len(out[0])))

# Create the subplots
fig, axes = plt.subplots(10, 10, figsize=(8, 5))

# Plot the SOM similarity grid of each sample in its own subplot
for i, (_, som_grid) in enumerate(out):
    ax = axes.flat[i]
    cax = ax.matshow(som_grid.numpy(), cmap='viridis', vmin=0, vmax=1)
    fig.colorbar(cax, ax=ax)
    ax.set_title(f'Array {i + 1}')

# Remove any empty subplots
for j in range(i + 1, len(axes.flat)):
    fig.delaxes(axes.flat[j])

plt.tight_layout()
plt.show()

---
### **Actor**

In [10]:
class Actor(Model):
    """Policy network mapping a (state, context) pair to a scaled action.

    State and context each pass through their own Dense + BatchNormalization
    branch. The branches are concatenated and fed through four further
    Dense + BatchNormalization stages and a ``tanh`` output layer, whose
    result is multiplied by ``act_range``.

    ``s_inp_dim`` and ``con_inp_dim`` are accepted but not used; layer input
    sizes are inferred on the first call.
    """

    def __init__(self, s_inp_dim, s_fc1_dim, con_inp_dim, con_fc1_dim, fc2_dim, fc3_dim, fc4_dim, fc5_dim, out_dim, act_range, lr, tau):
        super().__init__()
        self.act_range = act_range
        self.tau = tau

        # State branch.
        self.s_fc1 = Dense(s_fc1_dim, activation='relu')
        self.s_bn1 = BatchNormalization()

        # Context branch.
        self.con_fc1 = Dense(con_fc1_dim, activation='relu')
        self.con_bn1 = BatchNormalization()

        # Shared trunk applied to the concatenated branches.
        self.fc2 = Dense(fc2_dim, activation='relu')
        self.bn2 = BatchNormalization()

        self.fc3 = Dense(fc3_dim, activation='relu')
        self.bn3 = BatchNormalization()

        self.fc4 = Dense(fc4_dim, activation='relu')
        self.bn4 = BatchNormalization()

        self.fc5 = Dense(fc5_dim, activation='relu')
        self.bn5 = BatchNormalization()

        self.out = Dense(out_dim, activation='tanh')

        self.optimizer = Adam(learning_rate=lr)


    @tf.function
    def call(self, state, context):
        """Return the action for a batch of states and contexts."""
        state_features = self.s_bn1(self.s_fc1(state))
        context_features = self.con_bn1(self.con_fc1(context))

        hidden = tf.concat([state_features, context_features], axis=1)

        trunk = (
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
            (self.fc5, self.bn5),
        )
        for dense, norm in trunk:
            hidden = norm(dense(hidden))

        return self.out(hidden) * self.act_range


    @tf.function
    def transfer_weights(self, target_model):
        """Soft-update ``target_model``: ``target <- tau * self + (1 - tau) * target``."""
        for target_var, online_var in zip(target_model.variables, self.variables):
            blended = self.tau * online_var + (1 - self.tau) * target_var
            target_var.assign(blended)


---
### **Critic**

In [11]:
class Critic(Model):
    """Q-network mapping a (state, action) pair to a value estimate.

    State and action each pass through their own Dense + BatchNormalization
    branch. The branches are concatenated and fed through four further
    Dense + BatchNormalization stages and a linear output layer.

    ``state_inp_dim`` and ``action_inp_dim`` are accepted but not used; layer
    input sizes are inferred on the first call.
    """

    def __init__(self, state_inp_dim, state_fc1_dim, action_inp_dim, action_fc1_dim, conc_fc1_dim, conc_fc2_dim, conc_fc3_dim, conc_fc4_dim, out_dim, lr, tau):
        super().__init__()
        self.tau = tau

        # State branch.
        self.s_fc1 = Dense(state_fc1_dim, activation='relu')
        self.s_bn1 = BatchNormalization()

        # Action branch.
        self.a_fc1 = Dense(action_fc1_dim, activation='relu')
        self.a_bn1 = BatchNormalization()

        # Shared trunk applied to the concatenated branches.
        self.fc1 = Dense(conc_fc1_dim, activation='relu')
        self.bn1 = BatchNormalization()

        self.fc2 = Dense(conc_fc2_dim, activation='relu')
        self.bn2 = BatchNormalization()

        self.fc3 = Dense(conc_fc3_dim, activation='relu')
        self.bn3 = BatchNormalization()

        self.fc4 = Dense(conc_fc4_dim, activation='relu')
        self.bn4 = BatchNormalization()

        self.out = Dense(out_dim, activation='linear')

        self.optimizer = Adam(learning_rate=lr)

    @tf.function
    def call(self, state, action):
        """Return the Q-value estimate for a batch of states and actions."""
        state_features = self.s_bn1(self.s_fc1(state))
        action_features = self.a_bn1(self.a_fc1(action))

        hidden = tf.concat([state_features, action_features], axis=1)

        trunk = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
        )
        for dense, norm in trunk:
            hidden = norm(dense(hidden))

        return self.out(hidden)


    @tf.function
    def transfer_weights(self, target_model):
        """Soft-update ``target_model``: ``target <- tau * self + (1 - tau) * target``."""
        for target_var, online_var in zip(target_model.variables, self.variables):
            blended = self.tau * online_var + (1 - self.tau) * target_var
            target_var.assign(blended)


---
### **DDPG Agent**

In [12]:
class DDPGAgent(object):
    """DDPG agent whose actor also receives an InstinctiveNetwork context.

    The agent owns the exploration noise, the replay buffer, the instinctive
    network, the online and target actor/critic networks, and a live
    matplotlib figure with training curves and SOM views.

    Args:
        state_dim: Length of an observation vector.
        action_dim: Length of an action vector.
        action_min, action_max: Bounds used to clip actions; ``action_max``
            also scales the actor output.
        memory_size: Replay buffer capacity.
        batch_size: Minibatch size for ``learn``.
        gamma: Discount factor.
        a_lr, c_lr: Actor / critic learning rates.
        tau: Soft-update coefficient for the target networks.
        epsilon: Initial probability of adding exploration noise.
        epsilon_decay: Multiplicative decay applied after each learning step.
        epsilon_min: Lower bound for ``epsilon``.
        max_steps: Step limit per episode.
        env_name: Gym environment id, used by ``act`` to build a rendering env.
    """

    def __init__(
        self, state_dim, action_dim, action_min, action_max, 
        memory_size, batch_size, gamma, a_lr, c_lr, tau, epsilon, 
        epsilon_decay, epsilon_min, max_steps, env_name
    ):
        
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_min = action_min
        self.action_max = action_max
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.a_lr = a_lr
        self.c_lr = c_lr
        self.tau = tau
        # Stored in a tf.Variable through the `epsilon` property so that the
        # compiled `policy` graph sees every later update.
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.max_steps = max_steps
        self.env_name = env_name

        self.noise = OUActionNoise(mean=np.zeros(action_dim), sigma=0.5, theta=0.2)

        # Creates the replay buffer
        self.memory = ReplayBuffer(self.memory_size, self.batch_size)

        # Creates the instinctive network
        som_dim = (15, 15)
        self.inst_net = InstinctiveNetwork(
            som_dims = som_dim, 
            input_dim = self.state_dim,
            som_kwargs = {
                'n_iterations': 1,
                'alpha': 1e-3,
                'sigma': 0.7,
            }, 
            inst_net_kwargs = {
                'func_x_max': 50,
                'func_x_drop': 40,
                'func_y_max': 10,
                'func_y_min': -7,
                'act_threshold': 4,
            }
        )

        self.actor, self.actor_target = [Actor(
            s_inp_dim=self.state_dim, 
            s_fc1_dim=512,
            con_inp_dim=np.prod(som_dim), 
            con_fc1_dim=512,
            fc2_dim=2048, 
            fc3_dim=512,
            fc4_dim=256,
            fc5_dim=32,
            out_dim=self.action_dim, 
            act_range=self.action_max, 
            lr=self.a_lr, 
            tau=self.tau,
        ) for _ in range(2)]

        self.critic, self.critic_target = [Critic(
            state_inp_dim=self.state_dim, 
            state_fc1_dim=256, 
            action_inp_dim=self.action_dim, 
            action_fc1_dim=128,
            conc_fc1_dim=512, 
            conc_fc2_dim=256,
            conc_fc3_dim=128,
            conc_fc4_dim=64,
            out_dim=1,
            lr=self.c_lr, 
            tau=self.tau,
        ) for _ in range(2)]

        # Subclassed Keras models own no variables until their first call, so
        # get_weights() on a fresh model is an empty list and copying it is a
        # silent no-op. Run one dummy forward pass through every network
        # before synchronising the targets with the online networks.
        dummy_state = tf.zeros((1, self.state_dim), dtype=tf.float32)
        dummy_context = tf.zeros((1, int(np.prod(som_dim))), dtype=tf.float32)
        dummy_action = tf.zeros((1, self.action_dim), dtype=tf.float32)
        for net in (self.actor, self.actor_target):
            net(dummy_state, dummy_context)
        for net in (self.critic, self.critic_target):
            net(dummy_state, dummy_action)
        self.actor_target.set_weights(self.actor.get_weights())
        self.critic_target.set_weights(self.critic.get_weights())
        
        self.create_plot()
        return


    @property
    def epsilon(self):
        """Current exploration probability as a Python float."""
        return float(self._epsilon.numpy())


    @epsilon.setter
    def epsilon(self, value):
        # A plain Python float would be frozen into the `policy` graph when it
        # is first traced; a variable is read anew on every call.
        if getattr(self, '_epsilon', None) is None:
            self._epsilon = tf.Variable(float(value), dtype=tf.float32, trainable=False)
        else:
            self._epsilon.assign(float(value))


    def create_plot(self):
        """Create the 2x2 monitoring figure (returns, steps, SOM value, SOM activation)."""
        self.fig = plt.figure()

        self.returns = self.fig.add_subplot(221)
        self.returns.title.set_text('Returns')

        self.n_steps = self.fig.add_subplot(223)
        self.n_steps.title.set_text('N Steps')

        self.som_val = self.fig.add_subplot(222)
        self.som_val.title.set_text('SOM val')

        self.som_act = self.fig.add_subplot(224)
        self.som_act.title.set_text('SOM Activation')

        self.fig.show()
        return


    @staticmethod
    def _reset_axis(axis, title):
        """Clear ``axis`` and restore its title so artists do not pile up."""
        axis.cla()
        axis.title.set_text(title)
        return axis


    def update_plots(self, returns=None, n_steps=None, som_val=None, som_act=None):
        """Redraw whichever panels were given new data.

        Args:
            returns: Sequence of episode returns.
            n_steps: Sequence of episode lengths.
            som_val: 2-D SOM similarity grid.
            som_act: Layer activations, reshaped to the grid shape for display.
        """
        if returns is not None:
            self._reset_axis(self.returns, 'Returns').plot(np.arange(len(returns)), returns)

        if n_steps is not None:
            # The x-axis follows n_steps itself, so the panel also works when
            # `returns` is not supplied.
            self._reset_axis(self.n_steps, 'N Steps').plot(np.arange(len(n_steps)), n_steps)

        if som_val is not None:
            self._reset_axis(self.som_val, 'SOM val').imshow(som_val)

        if som_act is not None:
            # Fall back to the configured grid size when no SOM grid is given.
            grid_shape = np.shape(som_val) if som_val is not None else tuple(self.inst_net.som_dims)
            self._reset_axis(self.som_act, 'SOM Activation').imshow(np.reshape(som_act, grid_shape))

        self.fig.canvas.draw()
        self.fig.canvas.flush_events()
        return


    @tf.function
    def policy(self, state, explore=True):
        """Choose an action for one state and advance the instinctive network.

        Args:
            state: One observation vector.
            explore: When True, OU noise is added with probability ``epsilon``.

        Returns:
            ``(action, context, som_act)``: the clipped action, the flattened
            instinctive-layer activations fed to the actor, and the SOM
            similarity grid.
        """
        context, som_act = self.inst_net.get_output(state)
        action = self.actor(tf.expand_dims(state, 0), tf.expand_dims(context, 0))[0]

        if explore:
            if tf.random.uniform(()) < self._epsilon:
                noise = self.noise()
                action += noise

        return tf.clip_by_value(action, self.action_min, self.action_max), context, som_act


    @tf.function
    def learn(self, states, actions, rewards, next_states, contexts, next_contexts, done):
        """Run one SOM, critic and actor update on a minibatch.

        Returns:
            ``(actor_loss, critic_loss)``.
        """
        # Train SOM using TensorFlow operations
        all_states = tf.concat([states, next_states], axis=0)
        self.inst_net.train_som(all_states)

        # The critic outputs shape (batch, 1). Rewards and done flags arrive
        # as (batch,), which would broadcast the target into a (batch, batch)
        # matrix, so give them the same column shape.
        rewards = tf.reshape(rewards, (-1, 1))
        done = tf.reshape(done, (-1, 1))

        # The bootstrap target is a constant with respect to the critic.
        target_actions = self.actor_target(next_states, next_contexts)
        target_q_values = self.critic_target(next_states, target_actions)
        y = tf.stop_gradient(rewards + self.gamma * target_q_values * (1 - done))

        with tf.GradientTape() as tape:
            q_values = self.critic(states, actions)
            critic_loss = tf.reduce_mean(tf.square(y - q_values))

        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(states, contexts)
            critic_value = self.critic(states, actions)
            actor_loss = -tf.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

        self.actor.transfer_weights(self.actor_target)
        self.critic.transfer_weights(self.critic_target)

        return actor_loss, critic_loss


    def act(self):
        """Render one greedy episode in a fresh human-mode environment."""
        # Reset the environment
        env2 = gym.make(self.env_name, hardcore=True, render_mode='human')
        try:
            state, _ = env2.reset()
            self.inst_net.reset_charges()
            done = False
            step = 0

            while not done:
                env2.render()
                action, context, _ = self.policy(state, explore=False)
                state, _, terminated, truncated, _ = env2.step(action.numpy())
                step += 1
                done = terminated or truncated or (step > self.max_steps)
        finally:
            # Close the render window even if the episode raised.
            env2.close()
        return


    def train(self, env, num_episodes, verbose, verbose_num, end_on_complete, complete_num, complete_value, act_after_batch, plot_act):
        """Train the agent on ``env``.

        Args:
            env: Gym environment using the five-value ``step`` API.
            num_episodes: Maximum number of episodes.
            verbose: Print a live per-step status line.
            verbose_num: Print a summary every this many episodes.
            end_on_complete: Stop once ``complete_num`` episodes within the
                current summary window reached ``complete_value``.
            complete_num: See ``end_on_complete``.
            complete_value: Score that counts an episode as completed.
            act_after_batch: Render a greedy episode after each summary.
            plot_act: Refresh the SOM panels on every step.

        Returns:
            ``(scores_history, steps_history)``.
        """
        scores_history = []
        steps_history = []

        print("BEGIN\n")
        complete = 0

        for episode in range(num_episodes):
            state, _ = env.reset()
            self.inst_net.reset_charges()
            done = False
            score = 0
            steps = 0

            while not done:
                action, context, som_val = self.policy(state)
                if plot_act: self.update_plots(som_val=som_val.numpy(), som_act=context.numpy())

                if verbose:
                    print("\r                                                          ", end="")
                    print(f"\rEpisode: {str(episode+1)} \tStep: {str(steps)} \tReward: {str(score)}", end="")
                
                next_state, reward, terminated, truncated, _ = env.step(action.numpy())
                # NOTE(review): this second policy() call steps the instinctive
                # network (and possibly the noise process) once more for the
                # same environment step; the next iteration then evaluates
                # next_state again. Confirm this double update is intended.
                _, next_context, _ = self.policy(next_state)
                
                # Only true termination is stored as `done`: a time-limit
                # truncation must still bootstrap from the next state.
                self.memory.append(
                    state, action.numpy(), reward, next_state,
                    context.numpy(), next_context.numpy(), terminated
                )
                
                if self.memory.isMin():
                    experiences = self.memory.sample()
                    states, actions, rewards, next_states, contexts, next_contexts, dones = [np.array([exp[i] for exp in experiences]) for i in range(7)]
                    
                    self.learn(
                        tf.convert_to_tensor(states, dtype=tf.float32),
                        tf.convert_to_tensor(actions, dtype=tf.float32),
                        tf.convert_to_tensor(rewards, dtype=tf.float32),
                        tf.convert_to_tensor(next_states, dtype=tf.float32),
                        tf.convert_to_tensor(contexts, dtype=tf.float32),
                        tf.convert_to_tensor(next_contexts, dtype=tf.float32),
                        tf.convert_to_tensor(dones, dtype=tf.float32)
                    )
                    self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

                state = next_state
                score += reward
                steps += 1
                done = terminated or truncated or (steps > self.max_steps)


            scores_history.append(score)
            steps_history.append(steps)
            self.update_plots(returns=scores_history, n_steps=steps_history)
            
            if(score >= complete_value):
                complete += 1
                if end_on_complete and complete >= complete_num: break
            
            if((episode+1)%verbose_num == 0):
                print("\r                                                                                                          ", end="")
                print(f'''\rEpisodes: {episode+1}/{num_episodes}\n\tTotal reward: {np.mean(scores_history[-verbose_num:])} +- {np.std(scores_history[-verbose_num:])}\n\tNum. steps: {np.mean(steps_history[-verbose_num:])} +- {np.std(steps_history[-verbose_num:])}\n\tCompleted: {complete}\n--------------------------''')
                if act_after_batch: self.act()
                complete = 0

        print("\nFINISHED")
        
        return scores_history, steps_history


    def save(self, path):
        """Save actor and critic weights to ``<path>_actor.weights.h5`` and ``<path>_critic.weights.h5``."""
        # Keras models expose save_weights(), not saveModel(); separate files
        # keep the critic from overwriting the actor.
        self.actor.save_weights(f"{path}_actor.weights.h5")
        self.critic.save_weights(f"{path}_critic.weights.h5")
        return


    def load(self, a_path, c_path):
        """Load actor / critic weights and resynchronise the target networks."""
        self.actor.load_weights(a_path)
        self.critic.load_weights(c_path)
        # Targets restart from the loaded weights rather than stale ones.
        self.actor_target.set_weights(self.actor.get_weights())
        self.critic_target.set_weights(self.critic.get_weights())
        return


---
### **Test**

In [13]:
# Environment used for training.
name = "BipedalWalker-v3"
env = gym.make(name, hardcore=True)

# Problem dimensions and action bounds come straight from the env spaces.
obs_space, act_space = env.observation_space, env.action_space
state_dim = obs_space.shape[0]
action_dim = act_space.shape[0]
action_min = act_space.low
action_max = act_space.high

# Agent hyper-parameters, kept together so they are easy to tune.
agent_config = dict(
    state_dim=state_dim,
    action_dim=action_dim,
    action_min=action_min,
    action_max=action_max,
    env_name=name,
    memory_size=1000000,
    batch_size=256,
    gamma=0.99,
    a_lr=3e-5,
    c_lr=5e-4,
    tau=1e-4,
    epsilon=1,
    epsilon_decay=0.9999,
    epsilon_min=0.4,
    max_steps=1600,
)
agent = DDPGAgent(**agent_config)

2024-09-10 16:15:45.028633: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [15]:
# Train for up to 3000 episodes, printing a summary and rendering a greedy
# episode every 10 episodes; stop early once 10 episodes inside one summary
# window score at least 300.
agent.train(
    env = env,
    num_episodes = 3000,
    verbose = True,
    verbose_num = 10,
    end_on_complete = True,
    complete_num = 10,
    complete_value = 300,
    act_after_batch = True,
    plot_act = False
)

BEGIN



Episode: 1 	Step: 63 	Reward: -13.794356586412839         

KeyboardInterrupt: 

In [16]:
# Inspect the instinctive network's unit-to-unit weight matrix as a NumPy array.
agent.inst_net.inner_weights.get_weight_matrix().numpy()

array([[7.14895815e-32, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.74162345e-33, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.53059556e-34, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        6.97811946e-36, 4.70963275e-37, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.70963275e-37, 2.31974275e-36, 9.09458667e-37],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.09458667e-37, 1.16931224e-35]], dtype=float32)

In [None]:
# Render one episode with exploration noise disabled.
agent.act()