## Deep Q-learning: CartPole training
In this environment, the goal is to keep the pole upright by moving the cart it is attached to left or right.

![](https://gymnasium.farama.org/_images/cart_pole.gif)

The agent has two possible actions: pushing the cart left or right. It observes the position and velocity of the cart, and the angle and angular velocity of the pole, which it uses to learn the optimal actions. These are continuous states - the position, velocities, and angle are real-valued rather than drawn from a finite set.

![](https://aleksandarhaber.com/wp-content/uploads/2023/01/sketch-1-1024x608.png)

In [None]:
## Import Libraries
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
from IPython.display import clear_output, Image, display
from matplotlib import animation

In [None]:
## Initialize the CartPole environment
# Created once at module level; train_dqn() reads this `env` as a global and it is
# also the environment passed to visualize_agent_performance().
env = gym.make('CartPole-v1')

In [None]:
## Set up DQN parameters
# Input width of the Q-network: first entry of the observation-space shape
state_space_size = env.observation_space.shape[0]
# Output width of the Q-network: number of discrete actions (one Q-value per action)
action_space_size = env.action_space.n

### Parameters for deep Q-learning 

In [None]:
# Hyperparameters
num_episodes = 200 # number of training episodes (each a sequence of states, actions, and rewards)
max_steps_per_episode = 500 # max number of steps in an episode - i.e. number of movements of the cart
learning_rate = 0.001 # learning rate handed to the Adam optimizer (alpha)
discount_rate = 0.99 # discount rate applied to the bootstrapped future reward (gamma)
exploration_rate = 1 # current exploration rate of the agent (epsilon); starts fully random and is overwritten by train_dqn()
max_exploration_rate = 1 # upper bound of the exponential epsilon schedule
min_exploration_rate = 0.01 # lower bound (asymptote) of the exponential epsilon schedule
# NOTE(review): epsilon = min + (max - min) * exp(-decay * episode). With decay = 0.001 and
# num_episodes = 200, epsilon is still about 0.82 in the last episode, so the agent acts
# mostly at random for the whole run - confirm this slow decay is intended.
exploration_decay_rate = 0.001 # decay rate of the exploration rate - epsilon is recomputed after each episode
batch_size = 64 # number of transitions sampled from the replay buffer after each environment step
memory_size = 10000 # max number of transitions kept in the replay buffer
target_update_freq = 10 # number of episodes between target Q-network weight updates

### Define and initialise the Q-neural network 

In [None]:
# Define the Q-network architecture - two ReLU hidden layers followed by a linear output layer

class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per action.

    Args:
        state_space_size: number of input features (length of the state vector).
        action_space_size: number of outputs (one Q-value per discrete action).
        hidden_size: number of neurons in each of the two hidden layers
            (defaults to 24).
    """

    def __init__(self, state_space_size, action_space_size, hidden_size=24):
        super().__init__()
        self.fc1 = nn.Linear(state_space_size, hidden_size)  # 1st hidden layer
        self.fc2 = nn.Linear(hidden_size, hidden_size)  # 2nd hidden layer
        self.fc3 = nn.Linear(hidden_size, action_space_size)  # output layer: one neuron per action

    def forward(self, x):
        """Return the Q-values for `x`; the last dimension of `x` must be `state_space_size`."""
        x = torch.relu(self.fc1(x))  # ReLU activation for the 1st hidden layer
        x = torch.relu(self.fc2(x))  # ReLU activation for the 2nd hidden layer
        # No activation on the output layer: Q-values are unbounded real numbers
        return self.fc3(x)

In [None]:
# Module-level training state; train_dqn() reads and updates all of these as globals.
model = QNetwork(state_space_size, action_space_size) # Online Q-network: selects actions and is the one being optimized
target_model = QNetwork(state_space_size, action_space_size) # Target Q-network: used only to compute bootstrapped targets
target_model.load_state_dict(model.state_dict())  # Start the target network with the same weights as the online network
memory = deque(maxlen=memory_size) # Replay buffer; once full, the oldest transitions are dropped
optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Optimizer over the online network's parameters only
criterion = nn.MSELoss() # Loss between predicted Q-values and target Q-values
rewards_all_episodes = [] # Total reward of each episode, appended to by train_dqn()

### Train the deep Q network 

In [None]:
## Training the DQN
def train_dqn():
    """Train the global Q-network `model` on `env` with epsilon-greedy deep Q-learning.

    Runs `num_episodes` episodes of at most `max_steps_per_episode` steps.
    Every transition is stored in the global replay buffer `memory`. Once the
    buffer holds more than `batch_size` transitions, a random minibatch is
    replayed after every environment step, taking one optimizer step per
    sampled transition. `target_model` is synchronised with `model` every
    `target_update_freq` episodes, and the global `exploration_rate` is
    recomputed from the exponential decay schedule after each episode.

    Side effects: updates `model`, `target_model`, `memory` and
    `exploration_rate`, and appends each episode's total reward to
    `rewards_all_episodes`.

    Note: written against the classic gym API, where `env.reset()` returns only
    the observation and `env.step()` returns a 4-tuple.
    """
    global exploration_rate

    for episode in range(num_episodes):
        state = env.reset()
        state = torch.tensor(np.reshape(state, [1, state_space_size]), dtype=torch.float32)
        rewards_current_episode = 0

        for _ in range(max_steps_per_episode):
            # Exploration-exploitation trade-off
            if np.random.rand() <= exploration_rate:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_values = model(state)
                action = torch.argmax(q_values[0]).item()

            next_state, reward, done, _ = env.step(action)
            next_state = torch.tensor(np.reshape(next_state, [1, state_space_size]), dtype=torch.float32)
            reward = torch.tensor([reward], dtype=torch.float32)
            done = torch.tensor([done], dtype=torch.float32)
            memory.append((state, action, reward, next_state, done))
            state = next_state
            rewards_current_episode += reward.item()

            if len(memory) > batch_size:
                minibatch = random.sample(memory, batch_size)
                for state_b, action_b, reward_b, next_state_b, done_b in minibatch:
                    # Targets are constants for the optimizer, so build them without gradients.
                    with torch.no_grad():
                        target = reward_b
                        if not done_b:
                            # Out-of-place add: an in-place `+=` here would modify the reward
                            # tensor stored in the replay buffer, inflating the stored reward
                            # every time the transition is sampled again.
                            target = reward_b + discount_rate * torch.max(target_model(next_state_b)[0])
                        # Start from the current predictions so that only the taken
                        # action's Q-value contributes to the loss.
                        target_f = model(state_b)
                        target_f[0][action_b] = target
                    output = model(state_b)
                    loss = criterion(output, target_f)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            if done:
                break

        if episode % target_update_freq == 0:
            target_model.load_state_dict(model.state_dict())  # Update target model

        # Exponential decay of epsilon from max_exploration_rate towards min_exploration_rate
        exploration_rate = min_exploration_rate + \
            (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

        rewards_all_episodes.append(rewards_current_episode)



In [None]:
## Run training
# Runs the whole training loop; afterwards rewards_all_episodes holds one total reward per episode
train_dqn()

### Check training success

In [None]:
## Calculate and print the average reward per 10 episodes
episodes_per_group = 10
# Integer division keeps the section count an int; np.split still raises ValueError
# if the rewards cannot be divided evenly into that many groups.
rewards_per_ten_episodes = np.split(np.array(rewards_all_episodes), num_episodes // episodes_per_group)
count = episodes_per_group

print("Average reward per ten episodes")
for r in rewards_per_ten_episodes:
    # `count` is the index of the last episode in the group being reported
    print(count, ": ", str(sum(r / episodes_per_group)))
    count += episodes_per_group

In [None]:
## Plotting the results
# Line plot of the total reward collected in each training episode
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(range(num_episodes), rewards_all_episodes)
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Reward vs Episode')
plt.show()

In [None]:
## Plot Q-values as a heatmap for a given state
def plot_q_values_heatmap_for_state(model, state):
    """Show the Q-values `model` predicts for `state` as a one-row annotated heatmap.

    Args:
        model: callable network returning a tensor of Q-values whose first
            row is plotted.
        state: input tensor passed to `model` unchanged.
    """
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        predictions = model(state)
    action_values = predictions.numpy()[0]
    heatmap_row = action_values.reshape(1, -1)  # a single row with one column per action

    plt.figure(figsize=(8, 8))
    sns.heatmap(heatmap_row, cmap='viridis', annot=True, fmt=".1f", cbar=True)
    plt.title('Q-values for Given State')
    plt.xlabel('Action')
    plt.ylabel('Q-value')
    plt.show()

In [None]:
## Visualize the agent's performance
def visualize_agent_performance(env, model):
    """Run one greedy episode and return the frames rendered along the way.

    The agent always takes the action with the highest predicted Q-value (no
    exploration). A frame is rendered before each step, and the episode stops
    when the environment reports `done` or after `max_steps_per_episode`
    steps. The environment is closed before returning, including when
    rendering or stepping raises.

    Args:
        env: environment using the classic gym API (`reset()` returns the
            observation, `step()` returns a 4-tuple, `render(mode=...)`).
        model: Q-network used to pick actions.

    Returns:
        List of frames as returned by `env.render(mode="rgb_array")`.
    """
    frames = []

    try:
        state = env.reset()
        state = torch.tensor(np.reshape(state, [1, state_space_size]), dtype=torch.float32)

        for _ in range(max_steps_per_episode):
            frames.append(env.render(mode="rgb_array"))
            with torch.no_grad():
                q_values = model(state)
            action = torch.argmax(q_values[0]).item()
            # Reward and info are not needed for a pure rollout
            next_state, _reward, done, _info = env.step(action)
            state = torch.tensor(np.reshape(next_state, [1, state_space_size]), dtype=torch.float32)

            if done:
                break
    finally:
        # Release the environment's resources even if the rollout fails part-way
        env.close()

    return frames



In [None]:
## Function to create animation using Pillow
def create_animation(frames, filename):
    """Save a sequence of image frames as an animation using the Pillow writer.

    Args:
        frames: non-empty sequence of images accepted by `plt.imshow`. The
            first frame initialises the image and every frame is then drawn
            in order.
        filename: output path handed to the animation's `save` method.
    """
    fig = plt.figure(figsize=(8, 8))
    try:
        plt.axis('off')
        patch = plt.imshow(frames[0])

        def update(frame):
            # Swap the pixel data of the existing image instead of re-plotting
            patch.set_data(frame)

        anim = animation.FuncAnimation(fig, update, frames=frames, interval=50)
        anim.save(filename, writer='pillow', fps=30)
    finally:
        # pyplot keeps figures registered until they are closed explicitly;
        # close this one so repeated calls do not accumulate open figures.
        plt.close(fig)

In [None]:
## Visualize the agent's performance and create animation
# Roll out one greedy episode with `model`, then write the rendered frames to a GIF file
frames = visualize_agent_performance(env, model)
create_animation(frames, 'cartpole_animation.gif')

In [None]:
## Display the saved animation
# Loads the GIF written by create_animation() from disk and shows it in the notebook output
display(Image(filename='cartpole_animation.gif'))