# Higher-Level RL Libraries

* Action selectors

In [16]:
import numpy as np
import ptan

q_vals = np.array([[1, 2, 3], [1, -1, 0]])
selector = ptan.actions.ArgmaxActionSelector()
print(selector(q_vals))

[2 0]


In [17]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0) # same as argmax?
print(selector(q_vals))

[2 0]


In [18]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
print(selector(q_vals))

[0 0]


In [19]:
selector = ptan.actions.ProbabilityActionSelector()
for _ in range(10):
    acts = selector(np.array([
        [0.2, 0.3, 0.5],
        [0.0, 0.0, 1.0],
        [0.5, 0.5, 0.0]]))
    print(acts)

[0 2 0]
[1 2 1]
[2 2 1]
[2 2 0]
[2 2 1]
[0 2 1]
[2 2 1]
[2 2 0]
[0 2 0]
[1 2 0]


* DQNAgent

In [20]:
import torch
import torch.nn as nn

class DQNNet(nn.Module):
    def __init__(self, actions: int):
        super(DQNNet, self).__init__()
        self.actions = actions
        
    def forward(self, x):
        return torch.eye(x.size()[0], self.actions)

In [21]:
net = DQNNet(actions=3)
net(torch.zeros(2, 10))

tensor([[1., 0., 0.],
        [0., 1., 0.]])

In [22]:
# simple argmax policy
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector) # dqn_model maps the observations, selector selects from the net's output
agent(torch.zeros(2, 5))

(array([0, 1]), [None, None])

In [23]:
# eplison-greedy
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector) # has internal net!
agent(torch.zeros(10,5))[0]

array([2, 2, 1, 0, 1, 1, 0, 2, 1, 2])

In [24]:
selector.epsilon = 0.5 # can change the epsilon on the fly!
agent(torch.zeros(10, 5))[0]

array([0, 1, 2, 0, 0, 0, 0, 0, 0, 1])

* PolicyAgent

In [25]:
class PolicyNet(nn.Module): # This network will produce probability
    def __init__(self, actions: int):
        super(PolicyNet, self).__init__()
        self.actions = actions
    
    def forward(self, x):
        shape = (x.size()[0], self.actions)
        res = torch.zeros(shape, dtype=torch.float32)
        res[:, 0] = 0.5
        res[:, 1] = 0.5
        return res

In [26]:
net = PolicyNet(actions=5)
net(torch.zeros(6, 10))

tensor([[0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000]])

In [27]:
# policyAgent in combination with ProbabilityActionSelector!
selector = ptan.actions.ProbabilityActionSelector()
agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True) # what's the deference between PolicyAgent and DQNAgent? -> softmax!!
agent(torch.zeros(6, 5))[0]

array([0, 1, 4, 3, 1, 1])

In [28]:
nn.functional.softmax(net(torch.zeros(1, 10)), dim=1) # non-zero result at 0

tensor([[0.2618, 0.2618, 0.1588, 0.1588, 0.1588]])

* Experience Source

In [29]:
import gym
# implement a very simple gym env

class ToyEnv(gym.Env):
    def __init__(self):
        super(ToyEnv, self).__init__()
        self.observation_space = gym.spaces.Discrete(n=5)
        self.action_space = gym.spaces.Discrete(n=3)
        self.step_index = 0
        pass
    
    def reset(self):
        self.step_index = 0
        return self.step_index
    
    def step(self, action):
        is_done = self.step_index == 10
        if is_done:
            return self.step_index % self.observation_space.n, 0.0, is_done, {}
        
        self.step_index += 1
        return self.step_index % self.observation_space.n, 0.0, is_done, {}

In [30]:
from typing import List, Optional, Tuple, Any

class DullAgent(ptan.agent.BaseAgent):
    def __init__(self, action: int):
        self.action = action
        
    def __call__(self, observations=List[Any], state: Optional[List]=None) -> Tuple[List[int], Optional[List]]:
        return [self.action for _ in observations], state

In [38]:
# ExperienceSource Class
env = ToyEnv()
agent = DullAgent(action=1) # choose 1 for every state -> doesn't require a net
exp_source = ptan.experience.ExperienceSource(env=env, agent=agent, steps_count=2) # batch's size is 2?
for idx, exp in enumerate(exp_source): # provides standard python iterator interface!
    if idx > 12:
        break
    print(exp)

(Experience(state=0, action=1, reward=0.0, done=False), Experience(state=1, action=1, reward=0.0, done=False))
(Experience(state=1, action=1, reward=0.0, done=False), Experience(state=2, action=1, reward=0.0, done=False))
(Experience(state=2, action=1, reward=0.0, done=False), Experience(state=3, action=1, reward=0.0, done=False))
(Experience(state=3, action=1, reward=0.0, done=False), Experience(state=4, action=1, reward=0.0, done=False))
(Experience(state=4, action=1, reward=0.0, done=False), Experience(state=0, action=1, reward=0.0, done=False))
(Experience(state=0, action=1, reward=0.0, done=False), Experience(state=1, action=1, reward=0.0, done=False))
(Experience(state=1, action=1, reward=0.0, done=False), Experience(state=2, action=1, reward=0.0, done=False))
(Experience(state=2, action=1, reward=0.0, done=False), Experience(state=3, action=1, reward=0.0, done=False))
(Experience(state=3, action=1, reward=0.0, done=False), Experience(state=4, action=1, reward=0.0, done=False))
(

In [41]:
exp_source = ptan.experience.ExperienceSource(env=[env, env], agent=agent, steps_count=4) # batch's size is 2?
for idx, exp in enumerate(exp_source): # provides standard python iterator interface!
    if idx > 12:
        break
    print(exp)
    print("---------------")

(Experience(state=0, action=1, reward=0.0, done=False), Experience(state=1, action=1, reward=0.0, done=False), Experience(state=3, action=1, reward=0.0, done=False), Experience(state=0, action=1, reward=0.0, done=False))
---------------
(Experience(state=0, action=1, reward=0.0, done=False), Experience(state=2, action=1, reward=0.0, done=False), Experience(state=4, action=1, reward=0.0, done=False), Experience(state=1, action=1, reward=0.0, done=False))
---------------
(Experience(state=1, action=1, reward=0.0, done=False), Experience(state=3, action=1, reward=0.0, done=False), Experience(state=0, action=1, reward=0.0, done=False), Experience(state=2, action=1, reward=0.0, done=False))
---------------
(Experience(state=2, action=1, reward=0.0, done=False), Experience(state=4, action=1, reward=0.0, done=False), Experience(state=1, action=1, reward=0.0, done=False), Experience(state=3, action=1, reward=0.0, done=False))
---------------
(Experience(state=3, action=1, reward=0.0, done=Fals