# Reinforcement Learning

This Jupyter Notebook is dedicated to exploring the field of reinforcement learning. Reinforcement learning is a subfield of machine learning that focuses on training agents to make sequential decisions in an environment to maximize a reward signal.

In this notebook, we will be using the Gymnasium library (the maintained successor to OpenAI Gym), which provides a collection of environments for developing and testing reinforcement learning algorithms. Specifically, we will be working with the CartPole environment, where the goal is to keep a pole balanced upright on a cart by pushing the cart to the left or to the right.

Throughout the notebook, we will cover two approaches to solving the task: random search over the weights of a linear policy, and policy learning with a small neural network trained on discounted rewards. We will visualize the training progress, analyze the rewards obtained, and generate animated GIFs of the agent's performance.

Let's dive into the exciting world of reinforcement learning and explore the capabilities of the CartPole environment!

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import imageio

In [None]:
# Create the CartPole environment; with render_mode='rgb_array', env.render()
# returns frames, which are collected further below to build the GIFs
env = gym.make('CartPole-v1', render_mode='rgb_array')

# Length of the observation vector and number of discrete actions
num_states = env.observation_space.shape[0]
num_actions = env.action_space.n
# The four observation components are: cart position, cart velocity, pole angle, pole angular velocity
# The two actions are: push cart to the left, push cart to the right
print(num_states, num_actions)
# Upper and lower bounds of every observation component
print(env.observation_space.high, env.observation_space.low)

## Random Search

In [None]:
def get_action(w, state):
    """Linear threshold policy.

    Projects ``state`` onto the weight vector ``w`` and returns 1 when the
    projection is non-negative, otherwise 0.
    """
    projection = w @ state
    if projection >= 0:
        return 1
    return 0

# Draw one random weight vector and query the policy once on a fresh initial state
weights = np.random.randn(num_states)
state, _ = env.reset()
action = get_action(weights, state)

print('weights:', weights)
print('state:', state)
# an action of 0 means push cart to the left, an action of 1 means push cart to the right
print('action:', action)

In [None]:
def run(env, w, max_steps=500):
    """Play one episode with the linear policy ``w`` and return its total reward.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        w: Weight vector handed to ``get_action`` together with the state.
        max_steps: Upper bound on the number of steps taken in the episode
            (defaults to 500, the value that used to be hard-coded).

    Returns:
        The sum of the rewards collected during the episode.
    """
    state, _ = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        state, reward, terminated, truncated, _ = env.step(get_action(w, state))
        total_reward += reward
        # The episode is over both when the task failed (terminated) and when
        # the environment cut it off (truncated); stepping further is invalid.
        if terminated or truncated:
            break
    return total_reward

def random_search(env, num_episodes):
    """Sample random linear policies and keep the one with the highest reward.

    Args:
        env: Environment to evaluate the policies on; the length of the weight
            vector is taken from ``env.observation_space.shape[0]``.
        num_episodes: Number of random weight vectors to try (one episode each).

    Returns:
        ``(best_weights, rewards)`` where ``rewards`` holds the total reward of
        every episode in order and ``best_weights`` is the weight vector of the
        best episode (``None`` only when ``num_episodes`` is 0).
    """
    # Size the weights from the environment that was passed in rather than
    # from a module-level variable, so the function works for any env.
    num_weights = env.observation_space.shape[0]
    best_weights = None
    # Start below every possible reward so the first episode always counts,
    # even in environments whose rewards are zero or negative.
    maximum_reward = -np.inf
    rewards = []
    for _ in range(num_episodes):
        weights = np.random.randn(num_weights)
        reward = run(env, weights)
        rewards.append(reward)
        if reward > maximum_reward:
            maximum_reward = reward
            best_weights = weights
    return best_weights, rewards


In [None]:
def plot_rewards(rewards):
    """Draw the per-episode rewards as a line chart and display it."""
    axes = plt.gca()
    axes.plot(rewards)
    axes.set_xlabel('Episode')
    axes.set_ylabel('Reward')
    axes.set_title('Rewards per Episode')
    plt.show()

In [None]:
# Run 500 episodes of random search and plot the reward of every episode.
# In my experience, sufficiently good weights are usually found within the first 50 episodes
best_weights, rewards = random_search(env, 500)
plot_rewards(rewards)

```python
# Record five episodes played with the best weights found by random search
frames = []
for _ in range(5):
    state, _ = env.reset()
    done = False
    while not done:
        frames.append(env.render())
        state, _, terminated, truncated, _ = env.step(get_action(best_weights, state))
        # Stop on failure (terminated) and on the time limit (truncated);
        # checking only the first flag keeps stepping a finished episode
        # whenever the policy is good enough to reach the time limit.
        done = terminated or truncated
# Write the GIF once, after all episodes have been recorded
imageio.mimsave('images/cartpole_random_search.gif', frames, fps=30)
```

![Cartpole](images/cartpole_random_search.gif)

## Policy Learning

In [None]:
class Fully:
    """Fully connected layer without bias and with an optional activation.

    ``activation`` may be ``'relu'``, ``'sigmoid'`` or anything else, in which
    case the layer is purely linear. Gradients are accumulated in
    ``weight_update`` until ``update_weights`` is called.
    """

    def __init__(self, neurons, input_length, activation=None):
        # Scale the random initial weights by 1 / sqrt(fan-in)
        fan_in_scale = np.sqrt(input_length)
        self.weights = np.random.randn(neurons, input_length) / fan_in_scale
        self.weight_update = np.zeros_like(self.weights)
        self.relu = activation == 'relu'
        self.sigmoid = activation == 'sigmoid'

    def forward(self, input):
        """Compute the layer output for ``input`` and cache both for backward."""
        self.input = input.copy()
        output = self.weights.dot(self.input)
        if self.relu:
            negative = output < 0
            output[negative] = 0
        if self.sigmoid:
            denominator = 1.0 + np.exp(-output)
            output = 1.0 / denominator
        self.y = output
        return self.y

    def backward(self, grad, _):
        """Accumulate the weight gradient and return the gradient w.r.t. the input.

        The second argument is accepted for interface compatibility and ignored.
        """
        delta = grad.copy()
        if self.relu:
            inactive = self.y <= 0
            delta[inactive] = 0
        if self.sigmoid:
            # Derivative of the sigmoid expressed through its cached output
            delta = delta * self.y * (1 - self.y)
        self.weight_update += np.outer(delta, self.input)
        return self.weights.T.dot(delta)

    def update_weights(self, lr):
        """Apply the accumulated gradient scaled by ``lr`` and reset the accumulator."""
        step = lr * self.weight_update
        self.weights -= step
        self.weight_update = np.zeros_like(self.weights)

class Network:
    """Sequential stack of layers sharing a single learning rate."""

    def __init__(self, topology, learning_rate):
        self.net = topology
        self.lr = learning_rate

    def update_weights(self):
        """Let every layer apply its accumulated update with the learning rate."""
        for layer in self.net:
            layer.update_weights(self.lr)

    def forward(self, x):
        """Feed ``x`` through all layers in order and return the final output."""
        signal = x
        for layer in self.net:
            signal = layer.forward(signal)
        return signal

    def backward(self, x, y):
        """Run a forward pass on ``x``, then push the gradient ``y`` back through the layers."""
        # The forward pass refreshes the inputs/outputs cached inside each layer
        self.forward(x)
        upstream = y
        for layer in reversed(self.net):
            upstream = layer.backward(upstream, self.lr)

In [None]:
class Agent:
    """Agent that samples binary actions from a network and learns from episode returns.

    The wrapped ``net`` must provide ``forward(state)``, ``backward(state, gradient)``
    and ``update_weights()``. The output of ``forward`` is used as the
    probability of choosing action 1.
    """

    def __init__(self, net):
        self.net = net

    def act(self, state):
        """Sample an action for ``state``.

        Returns:
            ``(action, prob)`` where ``prob`` is the network output and
            ``action`` is 1 with probability ``prob`` and 0 otherwise.
        """
        prob = self.net.forward(state)
        action = 1 if np.random.rand() < prob else 0
        return action, prob

    # We discount the rewards and normalize them
    # Discounting the rewards is important because we want to give more weight to the rewards that are closer to the present
    def _discount_rewards(self, rewards, gamma=0.99):
        """Return the discounted, normalized returns for one episode.

        Args:
            rewards: Per-step rewards of the episode, indexed along the first axis.
            gamma: Discount factor applied per step.

        Returns:
            A float array with the shape of ``rewards`` holding the discounted
            return of every step, shifted to zero mean and, when the returns
            are not all equal, scaled to unit standard deviation.
        """
        # Always work in floating point: an integer reward array would
        # otherwise truncate the discounted values and break the in-place
        # normalization below.
        rewards = np.asarray(rewards, dtype=float)
        discounted = np.zeros_like(rewards)
        if discounted.size == 0:
            return discounted
        reward = 0
        for t in reversed(range(len(rewards))):
            reward = reward * gamma + rewards[t]
            discounted[t] = reward
        # By normalizing the rewards, we can make the training more stable
        discounted -= np.mean(discounted)
        spread = np.std(discounted)
        # A single-step episode (or constant returns) has zero spread; dividing
        # would produce NaNs that permanently corrupt the network weights.
        if spread > 0:
            discounted /= spread
        return discounted

    def train(self, episodes, gamma=0.99):
        """Train the network for ``episodes`` episodes.

        Episodes are played on the module-level ``env``. After every episode
        the gradients of all its steps are accumulated and applied in a single
        weight update.

        Args:
            episodes: Number of episodes to play.
            gamma: Discount factor forwarded to ``_discount_rewards``.

        Returns:
            A list with the total (undiscounted) reward of every episode.
        """
        total_rewards = []

        for _ in range(episodes):
            error_gradients = []
            episode_states = []
            episode_rewards = []
            done, truncated = False, False

            state, _ = env.reset()
            while not (done or truncated):
                action, prob = self.act(state)
                # We want to encourage the actions that were taken
                error_gradients.append(-(action-prob))
                episode_states.append(state)
                state, reward, done, truncated, _ = env.step(action)
                episode_rewards.append(reward)

            # After every episode, we update the weights
            total_rewards.append(sum(episode_rewards))
            # Weight every step's gradient by its normalized discounted return
            error_gradients = np.vstack(error_gradients) * self._discount_rewards(np.vstack(episode_rewards), gamma)
            for state, gradient in zip(episode_states, error_gradients):
                self.net.backward(state, gradient)
            # We update the weights in a batch
            # This helps to stabilize the training, since the weights are updated less frequently
            # and a single example does not have a big impact on the weights
            self.net.update_weights()

        return total_rewards

In [None]:
# Policy network: 4 inputs -> 10 ReLU units -> 10 linear units -> 1 sigmoid output.
# The single sigmoid output is what Agent.act uses as the probability of action 1.
topology = [
    Fully(10, 4, activation='relu'),
    Fully(10, 10),
    Fully(1, 10, activation='sigmoid')]
net = Network(topology=topology, learning_rate=1e-2)

In [None]:
# Train the agent for 1000 episodes and plot the total reward of every episode
agent = Agent(net)
rewards = agent.train(1000)
plot_rewards(rewards)

```python
# Record five episodes played by the trained agent
frames = []
for _ in range(5):
    state, _ = env.reset()
    done = False
    while not done:
        frames.append(env.render())
        action, _ = agent.act(state)
        state, _, terminated, truncated, _ = env.step(action)
        # Stop on failure (terminated) and on the time limit (truncated);
        # checking only the first flag keeps stepping a finished episode
        # whenever the agent is good enough to reach the time limit.
        done = terminated or truncated
# Write the GIF once, after all episodes have been recorded
imageio.mimsave('images/cartpole_policy_learning.gif', frames, fps=30)
```

![Cartpole](images/cartpole_policy_learning.gif)