This notebook explores the OpenAI Gym CartPole environment.

The action is an `ndarray` with shape `(1,)` that takes one of two values in {0, 1}:

0 - push cart left

1 - push cart right

The observation has four values:

| Num | Observation           | Min                 | Max               |
|-----|-----------------------|---------------------|-------------------|
| 0   | Cart Position         | -4.8                | 4.8               |
| 1   | Cart Velocity         | -Inf                | Inf               |
| 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
| 3   | Pole Angular Velocity | -Inf                | Inf               |

In [28]:
%pip install gym
%pip install gym[classic_control]
%pip install tensorflow
%pip install keras-rl2


[notice] A new release of pip available: 22.1.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [1]:
def randomActions(env, episodes=10):
    """Play episodes with uniformly random actions and print each score.

    Params:
    env - Gym-style environment whose ``step`` returns the 5-tuple
          (observation, reward, terminated, truncated, info)
    episodes - number of episodes to play (defaults to 10)

    The environment is closed before returning, even if an episode raises.
    """
    try:
        for episode in range(episodes):
            env.reset()
            env.render()
            done = False
            score = 0

            while not done:
                action = env.action_space.sample()
                _, reward, terminated, truncated, _ = env.step(action)
                score += reward
                # An episode is also over when it is truncated (e.g. by a
                # time limit); looping on `terminated` alone would keep
                # stepping an environment that has already finished.
                done = terminated or truncated

            print('Episode: {}, Score: {}'.format(episode, score))
    finally:
        env.close()

In [2]:
def slightlySmart(env, episodes=10):
    """Play episodes with a pole-angle heuristic and print each score.

    The heuristic pushes the cart towards the side the pole is leaning:
    action 0 when the pole angle (observation[2]) is negative, action 1
    otherwise.

    Params:
    env - Gym-style environment whose ``reset`` returns (observation, info)
          and whose ``step`` returns the 5-tuple
          (observation, reward, terminated, truncated, info)
    episodes - number of episodes to play (defaults to 10)

    The environment is closed before returning, even if an episode raises.
    """
    try:
        for episode in range(episodes):
            # Use the initial observation to pick the first action instead of
            # blindly starting with a fixed push.
            observation, _ = env.reset()
            env.render()
            done = False
            score = 0

            while not done:
                action = 0 if observation[2] < 0 else 1
                observation, reward, terminated, truncated, _ = env.step(action)
                score += reward
                # Truncation (e.g. a time limit) ends the episode just like
                # termination does.
                done = terminated or truncated

            print('Episode: {}, Score: {}'.format(episode, score))
    finally:
        env.close()

In [3]:
import gym

# Create the CartPole-v1 environment. render_mode="human" presumably draws
# every step in an on-screen window -- confirm against the installed gym version.
env = gym.make("CartPole-v1", render_mode="human")
# Length of the observation vector (first dimension of the observation space).
states = env.observation_space.shape[0]
# Number of discrete actions the environment accepts.
actions = env.action_space.n

Keras Agent

In [4]:
import numpy as np
import tensorflow as tf

In [5]:
def build_model(states, actions):
    """Assemble the Keras Q-network.

    Params:
    states - number of input features fed to the first layer
    actions - number of output units (one linear output per action)

    Returns a Sequential model with two 24-unit ReLU hidden layers followed
    by a linear output layer.
    """
    dense = tf.keras.layers.Dense
    layer_stack = [
        dense(24, input_dim=states, activation='relu'),
        dense(24, activation='relu'),
        dense(actions, activation='linear'),
    ]
    return tf.keras.models.Sequential(layer_stack)

In [8]:
# Build the Keras network for this environment's observation/action sizes
# and print its layer-by-layer summary.
model = build_model(states, actions)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 24)                120       
                                                                 
 dense_2 (Dense)             (None, 24)                600       
                                                                 
 dense_3 (Dense)             (None, 2)                 50        
                                                                 
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [9]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory 

In [10]:
def build_agent(model, actions):
    """Wrap a Keras model in a keras-rl DQN agent.

    Params:
    model - the network handed to the agent as its ``model``
    actions - number of actions, passed as ``nb_actions``

    The agent uses a Boltzmann Q policy, a sequential memory limited to
    50000 entries with a window length of 1, 10 warm-up steps and a
    target-model update value of 1e-2.
    """
    agent_kwargs = dict(
        model=model,
        memory=SequentialMemory(limit=50000, window_length=1),
        policy=BoltzmannQPolicy(),
        nb_actions=actions,
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )
    return DQNAgent(**agent_kwargs)

In [11]:
# Build the keras-rl agent around the Keras model, then compile and train it.
agent = build_agent(model, actions)
# NOTE(review): `opt` is bound to the Adam *class*, and `_name` is set as a
# class attribute before the optimizer is instantiated below. This looks like
# a workaround for a keras-rl2 / TensorFlow version mismatch -- confirm.
opt = tf.keras.optimizers.Adam
opt._name='Adam'
agent.compile(opt(learning_rate=1e-3), metrics=['mae'])
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'Sequential' object has no attribute
# '_compile_time_distribution_strategy'", so this cell does not currently run
# to completion. Presumably a keras-rl2 / TensorFlow / Keras incompatibility --
# verify the installed versions before relying on this cell.
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)

AttributeError: 'Sequential' object has no attribute '_compile_time_distribution_strategy'

PyTorch DQN Agent

In [6]:
from collections import deque
import random
from scipy.interpolate import interp1d
import torch
import torch.nn as nn
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class ReplayMemory:
    """Bounded buffer of transitions used for experience replay.

    Once ``capacity`` transitions are stored, appending a new one discards
    the oldest (``deque`` with ``maxlen``).
    """

    def __init__(self, capacity):
        # capacity - maximum number of transitions kept in the buffer
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def update(self, transition):
        """Append one transition to the buffer.

        transition[0] = state at time t
        transition[1] = action at time t
        transition[2] = reward at time t
        transition[3] = state at time t + 1
        """
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions chosen at random.

        Raises ValueError when the buffer holds fewer than ``batch_size``
        transitions; check ``len(memory)`` first to avoid this.
        """
        if batch_size > len(self.memory):
            raise ValueError(
                'Cannot sample {} transitions from a memory holding {}'.format(
                    batch_size, len(self.memory)
                )
            )
        return random.sample(self.memory, batch_size)

In [8]:
class ActionObservationSequence:
    """Rolling record of the observations appended during a sequence."""

    def __init__(self, initial_observation, max_steps):
        # Params:
        # initial_observation - first observation of the environment (x_1),
        #                       stored as the first entry of the sequence
        # max_steps - maximum number of entries kept (used as the deque's
        #             maxlen, so older entries are dropped beyond this limit)
        self.sequence = deque([initial_observation], maxlen=max_steps)

    def update(self, observation):
        # Add the newest observation to the end of the sequence.
        self.sequence.extend((observation,))

In [9]:
class DQN(nn.Module):
    """Fully connected Q-network: one output value per action.

    Params:
    states - size of the input (observation) vector
    actions - number of outputs (one estimated Q-value per action)
    """

    def __init__(self, states, actions):
        super().__init__()
        self.layer1 = nn.Linear(states, 24)
        self.layer2 = nn.Linear(24, 48)
        self.layer3 = nn.Linear(48, 24)
        self.layer4 = nn.Linear(24, actions)

    def forward(self, x):
        """Map an observation tensor (or batch of them) to per-action values."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.relu(self.layer3(x))
        # The output layer stays linear: a ReLU here would clamp every value
        # estimate to be non-negative and zero the gradient of any output
        # that falls below zero.
        return self.layer4(x)

    # NOTE: an unfinished `def train` stub was removed from this class. It was
    # a syntax error, and a method with that name would also shadow
    # nn.Module.train(). Put optimisation logic in a separate function or a
    # differently named method.

In [10]:
class ExplorationScheduler():
    """Linear schedule for epsilon over the course of training.

    Params:
    epsilon_initial - value returned for episode 1
    epsilon_final - value returned for episode ``num_episodes``
    num_episodes - episode number at which the schedule reaches epsilon_final
    """

    def __init__(self, epsilon_initial, epsilon_final, num_episodes):
        # Episodes outside [1, num_episodes] clamp to the nearest endpoint
        # instead of raising, so an off-by-one in the caller's loop cannot
        # abort a training run.
        self.scheduler = interp1d(
            [1, num_episodes],
            [epsilon_initial, epsilon_final],
            bounds_error=False,
            fill_value=(epsilon_initial, epsilon_final),
        )

    # as the current episode increases, epsilon moves linearly from
    # epsilon_initial to epsilon_final (up or down, depending on which is larger)
    def getEpsilon(self, curr_episode):
        return float(self.scheduler(curr_episode))

In [11]:
import gym

# Build the environment and read the sizes of its observation/action spaces.
env = gym.make("CartPole-v1", render_mode="human")
states = env.observation_space.shape[0]
actions = env.action_space.n

model = DQN(states, actions)

memory_capacity = 1000
num_episodes = 100
max_seq_len = 512

# NOTE: in this notebook "epsilon" is the probability of *exploiting* the
# model (a random action is taken when random.random() > epsilon), so the
# schedule below moves from mostly exploring to mostly exploiting.
epsilon_initial = 0.20
epsilon_final = 0.95
ex_scheduler = ExplorationScheduler(epsilon_initial=epsilon_initial, epsilon_final=epsilon_final, num_episodes=num_episodes)

# Replay buffer holding (state, action, reward, next_state) transitions.
memory = ReplayMemory(memory_capacity)

# Episodes run 1..num_episodes inclusive so the last one uses epsilon_final.
for episode in range(1, num_episodes + 1):
    # Start every episode from a fresh environment state; stepping an
    # environment that has already finished is undefined behaviour in gym.
    state, _ = env.reset()
    aos = ActionObservationSequence(state, max_steps=max_seq_len)
    epsilon = ex_scheduler.getEpsilon(episode)
    is_done = False

    while not is_done:
        if random.random() > epsilon: # Explore, choose random action
            action = env.action_space.sample()
        else: # Exploit, choose perceived best action
            with torch.no_grad():
                # Convert state to torch tensor
                stateTensor = torch.as_tensor(state, dtype=torch.float32)

                # Pass state through model to get the per-action values
                modelOutput = model(stateTensor)

                # Index of the maximum value (predicted best action)
                action = torch.argmax(modelOutput).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over on termination OR truncation (time limit).
        is_done = terminated or truncated

        # Record the transition and advance to the next state.
        memory.update((state, action, reward, next_state))
        aos.update(next_state)
        state = next_state

    # TODO: sample a batch from `memory` and optimise `model` here; this cell
    # currently only collects experience.

    



  logger.warn(


: 