In [15]:
import gymnasium as gym
import numpy as np
from collections import deque

In [16]:
class Layer:
    """Base class for neural network layers"""
    def __init__(self):
        self.input = None
        self.output = None
    
    def forward(self, input_data):
        """Forward pass - to be implemented by subclasses"""
        raise NotImplementedError
    
    def backward(self, output_gradient, learning_rate):
        """Backward pass - to be implemented by subclasses"""
        raise NotImplementedError

class Activation(Layer):
    """Base activation layer"""
    def __init__(self, activation_func, activation_derivative):
        super().__init__()
        self.activation = activation_func
        self.activation_derivative = activation_derivative
    
    def forward(self, input_data):
        """Forward propagation for activation layer"""
        self.input = input_data
        self.output = self.activation(input_data)
        return self.output
    
    def backward(self, output_gradient, learning_rate):
        """Backward propagation for activation layer"""
        return output_gradient * self.activation_derivative(self.input)

In [17]:
class Dense(Layer):
    """Fully connected (dense) layer"""
    def __init__(self, input_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        
        # Xavier/Glorot initialization
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.biases = np.zeros((1, output_size))
        
        # Store gradients for optimizer
        self.weights_gradient = None
        self.biases_gradient = None
    
    def forward(self, input_data):
        """Forward propagation for dense layer"""
        self.input = input_data
        return np.dot(input_data, self.weights) + self.biases
    
    def backward(self, output_gradient, learning_rate=None):
        """Backward propagation for dense layer"""
        # Compute gradients
        self.weights_gradient = np.dot(self.input.T, output_gradient) / self.input.shape[0]
        self.biases_gradient = np.sum(output_gradient, axis=0, keepdims=True) / self.input.shape[0]
        input_gradient = np.dot(output_gradient, self.weights.T)
        
        if learning_rate is not None:
            self.weights -= learning_rate * self.weights_gradient
            self.biases -= learning_rate * self.biases_gradient
        
        return input_gradient

class ReLU(Activation):
    """ReLU activation layer"""
    def __init__(self):
        def relu(x):
            return np.maximum(0, x)
        
        def relu_derivative(x):
            return (x > 0).astype(float)
        
        super().__init__(relu, relu_derivative)



In [18]:
class Optimizer:
    """Base optimizer class"""
    def __init__(self):
        pass
    
    def update(self, layer):
        """Update layer parameters - to be implemented by subclasses"""
        raise NotImplementedError

class Adam(Optimizer):
    """Adam optimizer"""
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__()
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.t = 0  # time step
        
        # Store moment estimates for each layer
        self.m_weights = {}  # First moment (momentum)
        self.v_weights = {}  # Second moment (RMSprop)
        self.m_biases = {}
        self.v_biases = {}
    
    def update(self, layer):
        """Update parameters using Adam optimizer"""
        if not isinstance(layer, Dense) or layer.weights_gradient is None:
            return
        # Get layer id (use object id as unique identifier)
        layer_id = id(layer)
        
        # Initialize moment estimates if first time
        if layer_id not in self.m_weights:
            self.m_weights[layer_id] = np.zeros_like(layer.weights)
            self.v_weights[layer_id] = np.zeros_like(layer.weights)
            self.m_biases[layer_id] = np.zeros_like(layer.biases)
            self.v_biases[layer_id] = np.zeros_like(layer.biases)
        
        # Increment time step
        self.t += 1
        
        # Update biased first moment estimate for weights
        self.m_weights[layer_id] = (self.beta1 * self.m_weights[layer_id] + 
                                   (1 - self.beta1) * layer.weights_gradient)
        
        # Update biased second moment estimate for weights
        self.v_weights[layer_id] = (self.beta2 * self.v_weights[layer_id] + 
                                   (1 - self.beta2) * (layer.weights_gradient ** 2))
        
        # Update biased first moment estimate for biases
        self.m_biases[layer_id] = (self.beta1 * self.m_biases[layer_id] + 
                                  (1 - self.beta1) * layer.biases_gradient)
        
        # Update biased second moment estimate for biases
        self.v_biases[layer_id] = (self.beta2 * self.v_biases[layer_id] + 
                                  (1 - self.beta2) * (layer.biases_gradient ** 2))
        
        # Compute bias-corrected first moment estimate
        m_weights_corrected = self.m_weights[layer_id] / (1 - self.beta1 ** self.t)
        m_biases_corrected = self.m_biases[layer_id] / (1 - self.beta1 ** self.t)
        
        # Compute bias-corrected second moment estimate
        v_weights_corrected = self.v_weights[layer_id] / (1 - self.beta2 ** self.t)
        v_biases_corrected = self.v_biases[layer_id] / (1 - self.beta2 ** self.t)
        
        # Update parameters
        layer.weights -= (self.learning_rate * m_weights_corrected / 
                         (np.sqrt(v_weights_corrected) + self.epsilon))
        layer.biases -= (self.learning_rate * m_biases_corrected / 
                        (np.sqrt(v_biases_corrected) + self.epsilon))


In [19]:
class NeuralNetwork:
    def __init__(self, in_states, h1_nodes, out_actions):
        self.l1 = Dense(in_states, h1_nodes)
        self.ac1 = ReLU()
        self.l2 = Dense(h1_nodes, out_actions)
        self.ac2 = ReLU()

        self.layers = [self.l1, self.ac1, self.l2, self.ac2]

    def forward(self, x):
        return \
            self.ac2.forward(
                self.l2.forward(
                    self.ac1.forward(
                        self.l1.forward(x)
                    )
                )
            )

    def get_state(self):
        """
        Get the current state of the network including weights, biases, and optimizer state
        
        Returns:
            dict: Dictionary containing all network parameters and optimizer state
        """
        state = {
            'layers': [],
            'optimizer_state': None,
            'optimizer_type': self.optimizer.__class__.__name__,
            'optimizer_params': {}
        }
        
        # Save layer parameters
        for i, layer in enumerate(self.layers):
            if isinstance(layer, Dense):
                layer_state = {
                    'type': 'Dense',
                    'input_size': layer.input_size,
                    'output_size': layer.output_size,
                    'weights': layer.weights.copy(),
                    'biases': layer.biases.copy()
                }
                state['layers'].append(layer_state)
            elif isinstance(layer, Activation):
                layer_state = {
                    'type': layer.__class__.__name__
                }
                state['layers'].append(layer_state)
        
        # Save optimizer parameters
        if hasattr(self.optimizer, 'learning_rate'):
            state['optimizer_params']['learning_rate'] = self.optimizer.learning_rate
        
        if isinstance(self.optimizer, Adam):
            state['optimizer_params'].update({
                'beta1': self.optimizer.beta1,
                'beta2': self.optimizer.beta2,
                'epsilon': self.optimizer.epsilon
            })
            
            # Save Adam optimizer state (moment estimates)
            optimizer_state = {
                't': self.optimizer.t,
                'm_weights': {str(k): v.copy() for k, v in self.optimizer.m_weights.items()},
                'v_weights': {str(k): v.copy() for k, v in self.optimizer.v_weights.items()},
                'm_biases': {str(k): v.copy() for k, v in self.optimizer.m_biases.items()},
                'v_biases': {str(k): v.copy() for k, v in self.optimizer.v_biases.items()}
            }
            state['optimizer_state'] = optimizer_state
        
        return state
    
    def load_state(self, state):
        """
        Load network state from a previously saved state
        
        Args:
            state (dict): State dictionary from get_state()
        """
        # Clear existing layers
        self.layers = []
        
        # Recreate layers from state
        for layer_state in state['layers']:
            if layer_state['type'] == 'Dense':
                layer = Dense(layer_state['input_size'], layer_state['output_size'])
                layer.weights = layer_state['weights'].copy()
                layer.biases = layer_state['biases'].copy()
                self.layers.append(layer)
            elif layer_state['type'] == 'Sigmoid':
                self.layers.append(Sigmoid())
            elif layer_state['type'] == 'ReLU':
                self.layers.append(ReLU())
            elif layer_state['type'] == 'Tanh':
                self.layers.append(Tanh())
        
        # Recreate optimizer
        optimizer_type = state['optimizer_type']
        optimizer_params = state['optimizer_params']
        
        if optimizer_type == 'SGD':
            self.optimizer = SGD(learning_rate=optimizer_params.get('learning_rate', 0.01))
        elif optimizer_type == 'Adam':
            self.optimizer = Adam(
                learning_rate=optimizer_params.get('learning_rate', 0.001),
                beta1=optimizer_params.get('beta1', 0.9),
                beta2=optimizer_params.get('beta2', 0.999),
                epsilon=optimizer_params.get('epsilon', 1e-8)
            )
            
            # Restore Adam optimizer state if available
            if state['optimizer_state'] is not None:
                opt_state = state['optimizer_state']
                self.optimizer.t = opt_state['t']
                
                # Map layer IDs from saved state to current layers
                # We need to create a mapping since object IDs will be different
                dense_layers = [layer for layer in self.layers if isinstance(layer, Dense)]
                saved_layer_ids = list(opt_state['m_weights'].keys())
                
                # Create new moment estimates with current layer IDs
                for i, layer in enumerate(dense_layers):
                    if i < len(saved_layer_ids):
                        saved_id = saved_layer_ids[i]
                        current_id = id(layer)
                        
                        self.optimizer.m_weights[current_id] = opt_state['m_weights'][saved_id].copy()
                        self.optimizer.v_weights[current_id] = opt_state['v_weights'][saved_id].copy()
                        self.optimizer.m_biases[current_id] = opt_state['m_biases'][saved_id].copy()
                        self.optimizer.v_biases[current_id] = opt_state['v_biases'][saved_id].copy()
    
    def clone_to(self, target_network):
        """
        Clone this network's state to another network
        
        Args:
            target_network (NeuralNetwork): Target network to clone to
        """
        state = self.get_state()
        target_network.load_state(state)
        

In [20]:
class ReplayMemory():
    def __init__(self, maxlen):
        self.memory = deque([], maxlen=maxlen)

    def append(self, transition):
        self.memory.append(transition)

    def sample(self, sample_size):
        return random.sample(self.memory, sample_size)

    def __len__(self):
        return len(self.memory)

In [21]:
class CartPoleDQN():
    buffer_size = 100_000
    learning_rate = 1

    def train(self, episodes):
        env = gym.make("CartPole-v1")

        num_states = env.observation_space.shape[0]
        num_actions = env.action_space.n
        terminated = False
        truncated = False

        cart_pos = np.linspace(env.observation_space.low[0], env.observation_space.high[0], 30)
        cart_vel = np.linspace(env.observation_space.low[1], env.observation_space.high[1], 30)
        pole_angle = np.linspace(env.observation_space.low[2], env.observation_space.high[2], 30)
        pole_vel = np.linspace(env.observation_space.low[3], env.observation_space.high[3], 30)

        epsilon = 1

        self.memory = ReplayMemory(self.buffer_size)
        policy_dqn = NeuralNetwork(num_states, 10, num_actions)
        target_dqn = NeuralNetwork(num_states, 10, num_actions)

        policy_dqn.clone_to(target_dqn)

        self.optimizer = Adam(learning_rate=self.learning_rate)
        
        while not terminated and not truncated:
            new_state, reward, terminated, truncated, _ = env.step(env.action_space.sample())
        env.close()

In [22]:
acrobot = CartPoleDQN()
acrobot.train(episodes=1)

AttributeError: 'NeuralNetwork' object has no attribute 'optimizer'