# Higher-Level RL Libraries

* Action selectors

In [16]:
import numpy as np
import ptan

q_vals = np.array([[1, 2, 3], [1, -1, 0]])
selector = ptan.actions.ArgmaxActionSelector()
print(selector(q_vals))

[2 0]


In [17]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0) # same as argmax?
print(selector(q_vals))

[2 0]


In [18]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
print(selector(q_vals))

[0 0]


In [19]:
selector = ptan.actions.ProbabilityActionSelector()
for _ in range(10):
    acts = selector(np.array([
        [0.2, 0.3, 0.5],
        [0.0, 0.0, 1.0],
        [0.5, 0.5, 0.0]]))
    print(acts)

[0 2 0]
[1 2 1]
[2 2 1]
[2 2 0]
[2 2 1]
[0 2 1]
[2 2 1]
[2 2 0]
[0 2 0]
[1 2 0]


* DQNAgent

In [20]:
import torch
import torch.nn as nn

class DQNNet(nn.Module):
    def __init__(self, actions: int):
        super(DQNNet, self).__init__()
        self.actions = actions
        
    def forward(self, x):
        return torch.eye(x.size()[0], self.actions)

In [21]:
net = DQNNet(actions=3)
net(torch.zeros(2, 10))

tensor([[1., 0., 0.],
        [0., 1., 0.]])

In [22]:
# simple argmax policy
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector) # dqn_model maps the observations, selector selects from the net's output
agent(torch.zeros(2, 5))

(array([0, 1]), [None, None])

In [23]:
# eplison-greedy
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector) # has internal net!
agent(torch.zeros(10,5))[0]

array([2, 2, 1, 0, 1, 1, 0, 2, 1, 2])

In [24]:
selector.epsilon = 0.5 # can change the epsilon on the fly!
agent(torch.zeros(10, 5))[0]

array([0, 1, 2, 0, 0, 0, 0, 0, 0, 1])

* PolicyAgent

In [25]:
class PolicyNet(nn.Module): # This network will produce probability
    def __init__(self, actions: int):
        super(PolicyNet, self).__init__()
        self.actions = actions
    
    def forward(self, x):
        shape = (x.size()[0], self.actions)
        res = torch.zeros(shape, dtype=torch.float32)
        res[:, 0] = 0.5
        res[:, 1] = 0.5
        return res

In [26]:
net = PolicyNet(actions=5)
net(torch.zeros(6, 10))

tensor([[0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000]])

In [27]:
# policyAgent in combination with ProbabilityActionSelector!
selector = ptan.actions.ProbabilityActionSelector()
agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True) # what's the deference between PolicyAgent and DQNAgent? -> softmax!!
agent(torch.zeros(6, 5))[0]

array([0, 1, 4, 3, 1, 1])

In [28]:
nn.functional.softmax(net(torch.zeros(1, 10)), dim=1) # non-zero result at 0

tensor([[0.2618, 0.2618, 0.1588, 0.1588, 0.1588]])

* Experience Source

In [45]:
import gym
# implement a very simple gym env

class ToyEnv(gym.Env):
    def __init__(self):
        super(ToyEnv, self).__init__()
        self.observation_space = gym.spaces.Discrete(n=5)
        self.action_space = gym.spaces.Discrete(n=3)
        self.step_index = 0
        pass
    
    def reset(self):
        self.step_index = 0
        return self.step_index
    
    def step(self, action):
        is_done = self.step_index == 10
        if is_done:
            return self.step_index % self.observation_space.n, 1.0, is_done, {}
        
        self.step_index += 1
        return self.step_index % self.observation_space.n, 1.0, is_done, {}

In [46]:
from typing import List, Optional, Tuple, Any

class DullAgent(ptan.agent.BaseAgent):
    def __init__(self, action: int):
        self.action = action
        
    def __call__(self, observations=List[Any], state: Optional[List]=None) -> Tuple[List[int], Optional[List]]:
        return [self.action for _ in observations], state

In [47]:
# ExperienceSource Class
env = ToyEnv()
agent = DullAgent(action=1) # choose 1 for every state -> doesn't require a net
exp_source = ptan.experience.ExperienceSource(env=env, agent=agent, steps_count=2) # batch's size is 2?
for idx, exp in enumerate(exp_source): # provides standard python iterator interface!
    if idx > 12:
        break
    print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(Experience(state=4, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(

In [48]:
exp_source = ptan.experience.ExperienceSource(env=[env, env], agent=agent, steps_count=4) # batch's size is 2?
for idx, exp in enumerate(exp_source): # provides standard python iterator interface!
    if idx > 12:
        break
    print(exp)
    print("---------------")

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
---------------
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
---------------
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
---------------
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
---------------
(Experience(state=3, action=1, reward=1.0, done=Fals

* ExperienceSourceFirstLast

In [49]:
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1.0, steps_count = 1)

for idx, exp in enumerate(exp_source):
    print(exp)
    if idx > 10:
        break

ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)


In [51]:
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1.0, steps_count = 2) 

for idx, exp in enumerate(exp_source):
    print(exp)
    if idx > 10:
        break
        
# only the first action is shown?

ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=0)
ExperienceFirstLast(state=4, action=1, reward=2.0, last_state=1)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=0)
ExperienceFirstLast(state=4, action=1, reward=2.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)


* Experience Replay Buffers

In [76]:
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSourceFirstLast([env, env], agent,gamma=1.0 , steps_count=3) # put in a lot of envs, they all work the same
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=101)

In [77]:
len(buffer)

0

In [78]:
buffer.populate(120)
for step in range(1):
    buffer.populate(1)
    if len(buffer) < 5:
        continue
    batch = buffer.sample(4)
    
    for s in batch:
        print(s)

ExperienceFirstLast(state=2, action=1, reward=3.0, last_state=3)
ExperienceFirstLast(state=0, action=1, reward=3.0, last_state=None)
ExperienceFirstLast(state=3, action=1, reward=3.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=3.0, last_state=3)


* TargetNet class

In [79]:
class DQNNet(nn.Module):
    def __init__(self):
        super(DQNNet, self).__init__()
        self.ff = nn.Linear(5, 3)
        
    def forward(self, x):
        return self.ff(x)

In [80]:
net = DQNNet()
net

DQNNet(
  (ff): Linear(in_features=5, out_features=3, bias=True)
)

In [81]:
tgt_net = ptan.agent.TargetNet(net) # copy net!

In [82]:
net.ff.weight

Parameter containing:
tensor([[ 0.0901, -0.2835, -0.4371,  0.0918, -0.4165],
        [-0.2148, -0.4259, -0.2545,  0.2414, -0.2175],
        [-0.3484,  0.3733, -0.4437,  0.1449, -0.3311]], requires_grad=True)

In [83]:
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[ 0.0901, -0.2835, -0.4371,  0.0918, -0.4165],
        [-0.2148, -0.4259, -0.2545,  0.2414, -0.2175],
        [-0.3484,  0.3733, -0.4437,  0.1449, -0.3311]], requires_grad=True)

In [84]:
net.ff.weight.data += 1.0
print(net.ff.weight)

Parameter containing:
tensor([[1.0901, 0.7165, 0.5629, 1.0918, 0.5835],
        [0.7852, 0.5741, 0.7455, 1.2414, 0.7825],
        [0.6516, 1.3733, 0.5563, 1.1449, 0.6689]], requires_grad=True)


In [85]:
tgt_net.target_model.ff.weight 
# independent

Parameter containing:
tensor([[ 0.0901, -0.2835, -0.4371,  0.0918, -0.4165],
        [-0.2148, -0.4259, -0.2545,  0.2414, -0.2175],
        [-0.3484,  0.3733, -0.4437,  0.1449, -0.3311]], requires_grad=True)

In [87]:
tgt_net.sync() # sync two models
print(tgt_net.target_model.ff.weight)

Parameter containing:
tensor([[1.0901, 0.7165, 0.5629, 1.0918, 0.5835],
        [0.7852, 0.5741, 0.7455, 1.2414, 0.7825],
        [0.6516, 1.3733, 0.5563, 1.1449, 0.6689]], requires_grad=True)


## The PTAN CartPole solver

In [88]:
import gym
import ptan
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [89]:
HIDDEN_SIZE = 128
BATCH_SIZE = 16
TGT_NET_SYNC = 10
GAMMA = 0.9
REPLAY_SIZE = 1000
LR = 1e-3
EPS_DECAY = 0.99

In [90]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )
        
    def forward(self, x):
        return self.net(x.float())

In [92]:
@torch.no_grad()
def unpack_batch(batch, net, gamma): # reform batch for better calculation of loss
    states = []
    actions = []
    rewards = []
    done_masks = []
    last_states = []
    
    for exp in batch:
        states.append(exp.state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        done_masks.append(exp.last_state is None)
        
        if exp.last_state is None:
            last_states.append(exp.state) # need something of same size to put in
        else:
            last_states.append(exp.last_state)
    
    states_v = torch.tensor(states)
    actions_v = torch.tensor(actions)
    rewards_v = torch.tensor(rewards)
    last_states_v = torch.tensor(last_states)
    
    last_state_q_v = net(last_states_v)
    best_last_q_v = torch.max(last_state_q_v, dim=1)[0] # get the reward
    best_last_q_v[done_masks] = 0.0 # mask the ended reward
    return states_v, actions_v, best_last_q_v * gamma + rewards_v #Q(s, a) should be the third arg

In [100]:
env = gym.make("CartPole-v0")
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

# create data pipeline
net = Net(obs_size, HIDDEN_SIZE, n_actions)
tgt_net = ptan.agent.TargetNet(net) #create target net
selector = ptan.actions.ArgmaxActionSelector()
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1, selector=selector) # nest selectors like this!!
agent = ptan.agent.DQNAgent(net, selector)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA, steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)

optimizer = optim.Adam(net.parameters(), LR)

In [101]:
step = 0
episode = 0
solved = False

while True:
    step += 1
    buffer.populate(1)
    
    for reward, steps in exp_source.pop_rewards_steps(): # information about the ended episodes are saved
        episode += 1
        print("%d: episode %d done, reward=%.3f, epsilon=%.2f" %(step, episode, reward, selector.epsilon))
        solved = reward > 150 # check whether learning is completed
        
    if solved: 
        print("Congrats!")
        break
        
    if len(buffer) < 2 * BATCH_SIZE:
        continue
        
    batch = buffer.sample(BATCH_SIZE)
    states_v, actions_v, tgt_q_v = unpack_batch(batch, tgt_net.target_model, GAMMA)
    optimizer.zero_grad()
    q_v = net(states_v)
    q_v = q_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1) # pick q(s,a) for only the a taken
    loss_v = F.mse_loss(q_v, tgt_q_v)
    loss_v.backward()
    optimizer.step()
    selector.epsilon *= EPS_DECAY # decay
    
    if step % TGT_NET_SYNC == 0:
        tgt_net.sync() # sync nets
        

16: episode 1 done, reward=15.000, epsilon=1.00
50: episode 2 done, reward=34.000, epsilon=0.83
72: episode 3 done, reward=22.000, epsilon=0.67
86: episode 4 done, reward=14.000, epsilon=0.58
97: episode 5 done, reward=11.000, epsilon=0.52
125: episode 6 done, reward=28.000, epsilon=0.39
141: episode 7 done, reward=16.000, epsilon=0.33
154: episode 8 done, reward=13.000, epsilon=0.29
164: episode 9 done, reward=10.000, epsilon=0.27
173: episode 10 done, reward=9.000, epsilon=0.24
182: episode 11 done, reward=9.000, epsilon=0.22
192: episode 12 done, reward=10.000, epsilon=0.20
201: episode 13 done, reward=9.000, epsilon=0.18
210: episode 14 done, reward=9.000, epsilon=0.17
224: episode 15 done, reward=14.000, epsilon=0.15
233: episode 16 done, reward=9.000, epsilon=0.13
243: episode 17 done, reward=10.000, epsilon=0.12
255: episode 18 done, reward=12.000, epsilon=0.11
264: episode 19 done, reward=9.000, epsilon=0.10
275: episode 20 done, reward=11.000, epsilon=0.09
285: episode 21 done