# Deep Q-Networks

* Tabular Q-Learning

In [1]:
import gym
import collections
from tensorboardX import SummaryWriter

# Gym environment id used to build both the training and the evaluation environment.
ENV_NAME = "FrozenLake-v0"
# Discount factor applied to the best next-state value in the Q-value update.
GAMMA = 0.9
ALPHA = 0.2 # weight of the new estimate when blending it into the stored Q-value (moving average)
# Number of greedy evaluation episodes averaged after every training step.
TEST_EPISODES = 20

In [2]:
class Agent:
    """Tabular Q-learning agent that learns from single random environment steps.

    The Q-table is a ``defaultdict`` keyed by ``(state, action)``; pairs that
    were never updated read as ``0.0``.
    """

    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.state = self.env.reset()
        # Only Q-values are stored: no reward table and no transition counters.
        self.values = collections.defaultdict(float)

    def sample_env(self):
        """Take one random action and return ``(old_state, action, reward, new_state)``.

        When the step finishes the episode the environment is reset, so the
        agent's current state is always a valid state to act from.
        """
        prev_state = self.state
        picked_action = self.env.action_space.sample()
        new_state, reward, is_done, _ = self.env.step(picked_action)
        if is_done:
            self.state = self.env.reset()
        else:
            self.state = new_state
        return prev_state, picked_action, reward, new_state

    def best_value_and_action(self, state):
        """Return ``(best_value, best_action)`` for ``state``.

        Ties are resolved in favour of the lowest action index. Returns
        ``(None, None)`` when the action space is empty.
        """
        top_value, top_action = None, None
        for candidate in range(self.env.action_space.n):
            candidate_value = self.values[(state, candidate)]
            # Skip candidates that do not strictly improve on the current best.
            if top_value is not None and not (top_value < candidate_value):
                continue
            top_value, top_action = candidate_value, candidate
        return top_value, top_action

    def value_update(self, old_state, action, reward, next_state):
        """Blend the one-step Bellman target into ``Q(old_state, action)``."""
        next_best, _ = self.best_value_and_action(next_state)
        target = reward + GAMMA * next_best
        key = (old_state, action)
        previous = self.values[key]
        # Moving average: keep (1 - ALPHA) of the old estimate, add ALPHA of the target.
        self.values[key] = previous * (1-ALPHA) + target * ALPHA

    def play_episode(self, env):
        """Play one greedy episode on ``env`` and return its total reward.

        ``env`` is a separate environment, so evaluation does not disturb the
        state of the training environment held by the agent.
        """
        episode_reward = 0.0
        current = env.reset()
        finished = False
        while not finished:
            _, greedy_action = self.best_value_and_action(current)
            current, step_reward, finished, _ = env.step(greedy_action)
            episode_reward += step_reward
        return episode_reward

In [3]:
# Train the tabular agent one random environment step at a time and evaluate
# the greedy policy on a separate environment after every single update.
test_env = gym.make(ENV_NAME)
agent = Agent()
writer = SummaryWriter(comment="-q-learning")

iter_no = 0
best_reward = 0.0

try:
    while True:
        iter_no += 1
        s, a, r, next_s = agent.sample_env()
        agent.value_update(s, a, r, next_s) # update with one step -> no cache

        # Average total reward of TEST_EPISODES greedy episodes.
        reward = 0.0
        for _ in range(TEST_EPISODES):
            reward += agent.play_episode(test_env)
        reward /= TEST_EPISODES
        writer.add_scalar("reward", reward, iter_no)
        if reward > best_reward:
            print("Best reward updated %.3f -> %.3f"%(best_reward, reward))
            best_reward = reward
        # Strict comparison: an average of exactly 0.8 does not stop the loop.
        if reward > 0.8:
            print("Solved in %d iterations"%iter_no)
            break
finally:
    # Close the writer even when the loop is interrupted (e.g. KeyboardInterrupt)
    # instead of only on the "solved" path.
    writer.close()

Best reward updated 0.000 -> 0.050
Best reward updated 0.050 -> 0.100
Best reward updated 0.100 -> 0.150
Best reward updated 0.150 -> 0.200
Best reward updated 0.200 -> 0.300
Best reward updated 0.300 -> 0.400
Best reward updated 0.400 -> 0.450
Best reward updated 0.450 -> 0.500
Best reward updated 0.500 -> 0.600
Best reward updated 0.600 -> 0.750
Best reward updated 0.750 -> 0.800


KeyboardInterrupt: 

## DQN on Pong

* Wrappers

In [None]:
# implementation of wrappers.py

import cv2
import gym
import gym.spaces
import numpy as np 
import collections

In [None]:
class FireResetEnv(gym.Wrapper):
    """Press FIRE (action 1) and then action 2 right after every reset.

    The constructor asserts that action 1 of the wrapped environment is
    ``"FIRE"`` and that the environment exposes more than three actions.
    """

    def __init__(self, env=None):
        super(FireResetEnv, self).__init__(env) # stores env as self.env
        meanings = env.unwrapped.get_action_meanings()
        assert meanings[1] == "FIRE" # action 1 must be the FIRE button
        assert len(meanings) > 3 # action 2 (used in reset) must exist as well

    def step(self, action):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(action)

    def reset(self):
        """Reset the environment, play the two warm-up actions, return the observation.

        Returns:
            The observation the agent should act on next.
        """
        self.env.reset()
        obs, _, done, _ = self.env.step(1) # press FIRE
        if done:
            self.env.reset()
        obs, _, done, _ = self.env.step(2) # second warm-up action
        if done:
            # The warm-up step ended the episode: hand back the observation of
            # the fresh episode rather than the stale terminal observation, so
            # the caller's state matches the environment's actual state.
            obs = self.env.reset()
        return obs

In [None]:
class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action ``skip`` times and return the pixel-wise max of the last two frames.

    Rewards of the repeated steps are summed; the element-wise maximum over
    the two most recent observations is meant to counter sprite flickering.
    """

    def __init__(self, env = None, skip=4):
        super().__init__(env)
        self._skip = skip
        # Only the two most recent raw observations are retained.
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Apply ``action`` up to ``skip`` times, stopping early when the episode ends.

        Returns:
            ``(max_frame, total_reward, done, info)`` where ``info`` comes from
            the last executed inner step.
        """
        done = None
        reward_sum = 0.0
        for _ in range(self._skip):
            frame, step_reward, done, info = self.env.step(action)
            reward_sum += step_reward
            self._obs_buffer.append(frame)
            if done:
                break

        recent_frames = np.stack(self._obs_buffer)
        return np.max(recent_frames, axis=0), reward_sum, done, info

    def reset(self):
        """Reset the wrapped environment and seed the frame buffer with its first frame."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame
        

In [None]:
class ProcessFrame84(gym.ObservationWrapper):
    """Turn raw RGB frames into 84x84x1 ``uint8`` grayscale observations."""

    def __init__(self, env = None):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        """Convert one raw observation with :meth:`process`."""
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Grayscale, downscale and crop a 210x160x3 or 250x160x3 frame to 84x84x1.

        Frames of any other size trip the "Unknown resolution." assertion.
        """
        # Only two frame heights are supported; the width is always 160.
        for height in (210, 250):
            if frame.size == height * 160 * 3:
                img = np.reshape(frame, [height, 160, 3]).astype(np.float32)
                break
        else:
            assert False, "Unknown resolution."
        # Weighted channel sum (luma formula); evaluation order kept left to right.
        red, green, blue = img[:, :, 0], img[:, :, 1], img[:, :, 2]
        gray = red * 0.299 + green * 0.587 + blue * 0.114
        # cv2 takes the target size as (width, height): the result has 110 rows, 84 columns.
        shrunk = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
        # Keep rows 18..101 so the image becomes square (84 rows).
        cropped = shrunk[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)

In [None]:
class BufferWrapper(gym.ObservationWrapper):
    """Expose the ``n_steps`` most recent observations stacked along axis 0.

    The observation space is the wrapped space repeated ``n_steps`` times
    along its first axis. The buffer starts as all zeros after every reset,
    so early observations are padded with zero frames.

    Args:
        env: Environment to wrap.
        n_steps: Number of consecutive observations kept in the buffer.
        dtype: dtype of the buffer and of the resulting observation space.
    """

    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(
            old_space.low.repeat(n_steps, axis=0),
            old_space.high.repeat(n_steps, axis=0), dtype=dtype)

    def reset(self):
        """Clear the history to zeros and return the buffer holding the first observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        """Shift the history by one slot and store ``observation`` as the newest entry.

        Returns:
            The internal buffer itself (not a copy); it is mutated again on
            the next call, so callers that keep it must copy it.
        """
        # Drop the oldest entry by moving every slot one position towards index 0.
        self.buffer[:-1] = self.buffer[1:]
        # Write the new observation into the LAST slot only. Assigning to
        # ``[:-1]`` here would overwrite the whole history with the new frame
        # and leave the newest slot permanently at its initial zeros.
        self.buffer[-1] = observation
        return self.buffer

In [None]:
class ImageToPytorch(gym.ObservationWrapper):
    """Reorder image observations from HWC to the CHW layout PyTorch convolutions expect."""

    def __init__(self, env):
        super().__init__(env)
        hwc = self.observation_space.shape
        # Last axis (channels) moves to the front; the first two axes follow.
        chw = (hwc[-1], hwc[0], hwc[1])
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=chw, dtype=np.float32)

    def observation(self, observation):
        """Move axis 2 of ``observation`` to position 0."""
        return np.moveaxis(observation, source=2, destination=0)

In [None]:
class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert observations to ``float32`` and divide them by 255."""

    def observation(self, observation):
        """Return a new ``float32`` array holding ``observation / 255.0``."""
        as_float = np.array(observation).astype(np.float32)
        return as_float / 255.0

In [None]:
def make_env(env_name):
    """Create ``env_name`` and wrap it with the full preprocessing stack.

    Wrappers are applied innermost first: frame skip with max-pooling, FIRE
    on reset, 84x84 grayscale, channel-first layout, a 4-observation history
    buffer and finally scaling to floats.
    """
    base_env = gym.make(env_name)
    skipping = MaxAndSkipEnv(base_env)
    firing = FireResetEnv(skipping)
    small_gray = ProcessFrame84(firing)
    channel_first = ImageToPytorch(small_gray)
    stacked = BufferWrapper(channel_first, 4)
    return ScaledFloatFrame(stacked)

* The DQN model

In [None]:
import torch
import torch.nn as nn
import numpy as np

class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: Shape of one observation without the batch dimension;
            its first entry is used as the number of input channels.
        n_actions: Number of outputs, one Q-value per action.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()

        in_channels = input_shape[0]
        # Conv2d arguments: input channels, output channels, kernel size, stride.
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # The flattened conv output size depends on the input resolution,
        # so it is measured rather than hard-coded.
        flat_features = self._get_conv_out(input_shape)
        head_layers = [
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def _get_conv_out(self, shape):
        """Return the number of elements the conv stack emits for one input of ``shape``."""
        probe = torch.zeros(1, *shape)
        conv_result = self.conv(probe)
        return int(np.prod(conv_result.size()))

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_actions)`` for a batch ``x``."""
        batch_size = x.size()[0]
        features = self.conv(x)
        # Flatten everything except the batch dimension before the linear head.
        flattened = features.view(batch_size, -1)
        return self.fc(flattened)

#### Training

In [None]:
import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

In [None]:
# Gym environment id the training script builds its environment from.
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
# Training stops once the mean reward of the last 100 episodes exceeds this value.
MEAN_REWARD_BOUND = 19

GAMMA = 0.99 # discount factor applied to the target network's next-state value
# Number of transitions sampled from the replay buffer per optimisation step.
BATCH_SIZE = 512
# Maximum number of transitions kept in the replay buffer.
REPLAY_SIZE = 100000
# Learning rate of the Adam optimizer.
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 10000 # copy the online network into the target network every this many frames
# Optimisation starts only once the buffer holds this many transitions;
# the epsilon schedule is also offset by this value.
REPLAY_START_SIZE = 10000

# Epsilon decreases linearly from EPSILON_START over EPSILON_DECAY_LAST_FRAME
# frames and is never allowed to drop below EPSILON_FINAL.
EPSILON_DECAY_LAST_FRAME = 150000
EPSILON_START = 1.0
EPSILON_FINAL = 0.01

Experience = collections.namedtuple("Experience", field_names=["state", "action", "reward",
                                                               "done", "new_state"])
# one transition as stored in the replay buffer

In [None]:
class ExperienceBuffer:
    """Fixed-capacity replay buffer; the oldest transitions are evicted first."""

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Store one transition, dropping the oldest one when the buffer is full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five arrays ``(states, actions, rewards, dones, next_states)``;
            rewards are ``float32`` and dones are ``uint8``.
        """
        stored = len(self.buffer)
        # replace=False: no transition appears twice in a batch.
        picked = np.random.choice(stored, batch_size, replace=False)
        # Transpose the list of 5-tuples into five per-field columns.
        columns = zip(*(self.buffer[position] for position in picked))
        states, actions, rewards, dones, next_states = columns
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )

In [None]:
class Agent:
    """Epsilon-greedy actor that plays one environment and fills a replay buffer.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()``.
        exp_buffer: Object with an ``append`` method that receives one
            ``Experience`` per step.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode on the agent's own environment."""
        # Use self.env, not a module-level ``env``: the agent must act on the
        # environment it was constructed with.
        self.state = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Play one step, store the transition and return the episode reward if it ended.

        Args:
            net: Callable mapping a batch of states to per-action Q-values.
            epsilon: Probability of taking a random action instead of the
                greedy one.
            device: Device the state tensor is moved to before calling ``net``.

        Returns:
            The total reward of the episode when this step finished it,
            otherwise ``None``.
        """
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # Wrap the state in a list to add the batch dimension. A copy is
            # always needed here, so ``np.array(..., copy=False)`` would raise
            # under NumPy 2; ``np.asarray`` works on both major versions.
            state_a = np.asarray([self.state])
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward,
                         is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [None]:
def calc_loss(batch, net, tgt_net, device="cpu", gamma=None):
    """Return the MSE between predicted Q-values and one-step Bellman targets.

    Args:
        batch: Tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes with matching first dimension.
        net: Online network; gradients flow through its predictions.
        tgt_net: Target network; evaluated without gradient tracking.
        device: Device all tensors are moved to.
        gamma: Discount factor. ``None`` (the default) uses the module-level
            ``GAMMA``.

    Returns:
        Scalar loss tensor.
    """
    if gamma is None:
        gamma = GAMMA
    states, actions, rewards, dones, next_states = batch

    # np.asarray avoids a copy when possible and, unlike
    # ``np.array(..., copy=False)``, does not raise under NumPy 2 when a copy
    # turns out to be necessary (e.g. list inputs).
    states_v = torch.tensor(np.asarray(states)).to(device)
    next_states_v = torch.tensor(np.asarray(next_states)).to(device)
    # gather() requires int64 indices regardless of the platform's default int.
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q-value of the action that was actually taken in each state.
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # Terminal transitions have no future value.
        next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values * gamma + \
                                   rewards_v
    return nn.MSELoss()(state_action_values,
                        expected_state_action_values)

In [None]:
# main
# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# without CUDA instead of failing when the networks are moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env_name = DEFAULT_ENV_NAME
env = make_env(env_name)

# Online network (trained) and target network (used for the Bellman targets).
net = DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device)

writer = SummaryWriter(comment="-"+env_name)
print(net)

In [None]:
# DQN training loop: play one environment step per iteration, log finished
# episodes, and run one optimisation step once the replay buffer is warm.
buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
frame_idx = 0
ts_frame = 0        # frame index at which the previous episode ended
ts = time.time()    # wall-clock time at which the previous episode ended
best_m_reward = None

try:
    while True:
        frame_idx += 1
        # Linear decay that starts only after the warm-up phase. Clamp on both
        # sides: without the upper bound epsilon exceeds 1.0 during warm-up
        # (frame_idx < REPLAY_START_SIZE) and would be logged as such.
        decayed = EPSILON_START - (frame_idx-REPLAY_START_SIZE) / EPSILON_DECAY_LAST_FRAME
        epsilon = min(EPSILON_START, max(EPSILON_FINAL, decayed))

        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None: # episode is over
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts) # frames per sec
            ts_frame = frame_idx
            ts = time.time()
            m_reward = np.mean(total_rewards[-100:])
            print("%d: done %d games, reward %.3f, "
                  "eps %.2f, speed %.2f f/s" % (
                      frame_idx, len(total_rewards), m_reward, epsilon, speed))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", m_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            # Checkpoint the online network whenever the running mean improves.
            if best_m_reward is None or best_m_reward < m_reward:
                torch.save(net.state_dict(), env_name+"-best_%.0f.dat" % m_reward)
                if best_m_reward is not None:
                    print("Best reward updated %.3f -> %.3f" %(best_m_reward, m_reward))
                best_m_reward = m_reward

            if m_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!: " % frame_idx)
                break

        if len(buffer) < REPLAY_START_SIZE: # not enough buffer
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict()) # sync two nets

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE) # train with every new steps
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
finally:
    # Close the writer even when training is interrupted or raises, not only
    # when the reward bound is reached.
    writer.close()
    