## Simple standalone DQN solution for Pong using a target network

In [1]:
import gym
import collections
import numpy as np

### gym wrappers

In [7]:
class FireResetEnv(gym.Wrapper):
    """
    Wrapper that plays two warm-up actions right after every reset.

    Action index 1 must be 'FIRE' and the action set must contain at least
    three actions; both conditions are asserted at construction time.
    """
    def __init__(self, env=None):
        super().__init__(env)
        meanings = env.unwrapped.get_action_meanings()
        assert meanings[1] == 'FIRE'
        assert len(meanings) >= 3

    def step(self, action):
        """Forward the action to the wrapped env unchanged."""
        return self.env.step(action)

    def reset(self):
        """
        Reset the wrapped env, then step with action 1 (FIRE) and action 2.

        If either warm-up step ends the episode, the wrapped env is reset
        again. Returns the observation produced by the second warm-up step.
        """
        self.env.reset()
        for warmup_action in (1, 2):
            obs, _, done, _ = self.env.step(warmup_action)
            if done:
                self.env.reset()
        return obs

In [8]:
class MaxAndSkipEnv(gym.Wrapper):
    """
    Repeat every chosen action for `skip` frames, so the agent only makes a
    decision every `skip` frames.
    """
    def __init__(self, env=None, skip=4):
        super().__init__(env)
        self._skip = skip
        # only the two most recent raw frames are kept for the pixel-wise max
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """
        Apply `action` up to `skip` times, stopping early when the episode ends.

        Returns (element-wise max over the buffered frames, summed reward,
        done flag of the last inner step, info of the last inner step).
        """
        done = None
        reward_sum = 0.0
        for _ in range(self._skip):
            frame, frame_reward, done, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += frame_reward
            if done:
                break

        recent_frames = np.stack(self._obs_buffer)
        return recent_frames.max(axis=0), reward_sum, done, info

    def reset(self):
        """Empty the frame buffer, reset the wrapped env and buffer its first frame."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame

In [9]:
class ProcessFrame84(gym.ObservationWrapper):
    """
    Turn RGB emulator frames (210x160 or 250x160) into 84x84x1 uint8
    grayscale observations.
    """
    def __init__(self, env=None):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """
        Convert one raw frame to an (84, 84, 1) uint8 grayscale image.

        The frame resolution is recognised from its element count; any other
        size trips the "Unknown resolution" assertion.
        """
        if frame.size == 210 * 160 * 3:
            rows = 210
        elif frame.size == 250 * 160 * 3:
            rows = 250
        else:
            assert False, "Unknown resolution"

        rgb = np.reshape(frame, [rows, 160, 3]).astype(np.float32)
        red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        # weighted sum of the three colour channels -> single gray channel
        gray = red*0.299 + green*0.587 + blue*0.114
        # shrink to 110 rows x 84 columns, then keep rows 18..101 -> 84x84
        shrunk = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
        cropped = shrunk[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)

In [10]:
class BufferWrapper(gym.ObservationWrapper):
    """
    Keeps a rolling stack of the 'n_steps' latest frames and returns that
    stack as the observation.

    The stacked observation space is built by repeating the wrapped env's
    low/high bounds 'n_steps' times along axis 0.
    """
    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        inner_space = env.observation_space
        self.observation_space = gym.spaces.Box(inner_space.low.repeat(n_steps, axis=0),
                                                inner_space.high.repeat(n_steps, axis=0),
                                                dtype=dtype)

    def reset(self):
        """Start a new episode with an all-zero stack and push the first frame."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        """
        Shift the stack by one slot, store the newest frame last and return
        a snapshot of the stack.
        """
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        # Return a copy: self.buffer is overwritten in place on the next call,
        # so handing out the buffer itself would make every observation a
        # caller has stored (e.g. in a replay buffer) alias the same array.
        return self.buffer.copy()

In [11]:
class ImageToPytorch(gym.ObservationWrapper):
    """
    Reorder image observations so the last axis (channels) comes first,
    which is the layout the PyTorch conv layers in this file consume.
    """
    def __init__(self, env):
        super().__init__(env)
        old_shape = self.observation_space.shape
        channels_first = (old_shape[-1], old_shape[0], old_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=channels_first, dtype=np.float32)

    def observation(self, observation):
        """Move axis 2 of the observation to position 0."""
        return np.moveaxis(observation, 2, 0)

In [12]:
class ScaledFloatFrame(gym.ObservationWrapper):
    """
    Converts observations to float32 and divides them by 255.
    """
    def observation(self, obs):
        as_float = np.array(obs).astype(np.float32)
        return as_float / 255.0

In [13]:
def make_env(env_name):
    """
    Create the gym environment `env_name` and wrap it with the preprocessing
    stack defined in this file.

    Wrappers are applied innermost first: frame skipping, fire-on-reset,
    84x84 grayscale conversion, channel-first layout, 4-frame stacking and
    finally float scaling.
    """
    wrappers = (
        MaxAndSkipEnv,
        FireResetEnv,
        ProcessFrame84,
        ImageToPytorch,
        lambda inner: BufferWrapper(inner, 4),
        ScaledFloatFrame,
    )
    env = gym.make(env_name)
    for wrap in wrappers:
        env = wrap(env)
    return env

### training

In [14]:
import cv2

import torch
import torch.nn as nn

In [15]:
class DQN(nn.Module):
    """
    Convolutional Q-network following DeepMind's architecture from
    "Human-Level Control through Deep Reinforcement Learning" (Nature).

    input_shape: (channels, height, width) of a single observation
    n_actions:   number of Q-values produced per observation
    """
    def __init__(self, input_shape, n_actions):
        super().__init__()
        in_channels = input_shape[0]
        feature_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # the flattened feature count depends on the input resolution
        flat_features = self._get_conv_out(input_shape)
        head_layers = [
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """
        Run the conv stack, flatten everything but the batch dimension and
        feed the result to the fully connected head.
        """
        batch_size = x.size()[0]
        features = self.conv(x)
        flattened = features.view(batch_size, -1)
        return self.fc(flattened)

    def _get_conv_out(self, shape):
        """
        Push one all-zero sample through the conv stack and return the
        number of elements it produces.
        """
        probe = torch.zeros(1, *shape)
        return int(np.prod(self.conv(probe).size()))

In [16]:
import time
import torch.optim as optim

from torch.utils.tensorboard import SummaryWriter

In [17]:
# id of the gym environment handed to make_env()
GYM_ENV_NAME = "PongNoFrameskip-v4"
# training stops once the mean reward over the last 100 episodes exceeds this
MEAN_REWARD_BOUND = 19.0
# discount applied to the next-state value when building the TD target
GAMMA = 0.99
# learning rate of the Adam optimizer
LEARNING_RATE = 1e-4
# number of transitions sampled from the replay buffer per optimization step
BATCH_SIZE = 32

# capacity of the replay buffer; the oldest transitions are dropped first
REPLAY_MAX_SIZE = 10000
# number of frames to collect in the replay buffer before training starts
REPLAY_START_SIZE = 10000
# number of frames between copies of the training net's weights into the target net
SYNC_TARGET_FRAMES = 1000

# epsilon is computed as EPSILON_START - frame_i / EPSILON_DECAY_LAST_FRAME
# and is never allowed to drop below EPSILON_END
EPSILON_DECAY_LAST_FRAME = 150000
EPSILON_START = 1.0
EPSILON_END = 0.01

In [18]:
# one environment transition as stored in the replay buffer
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'],
)


class ExperienceBuffer:
    """
    Fixed-capacity replay memory of Experience tuples; once full, appending
    discards the oldest entry.
    """
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        """
        Draw `batch_size` distinct stored transitions uniformly at random.

        Returns a 5-tuple of arrays (states, actions, rewards, dones,
        next_states); rewards are float32 and dones are uint8.
        """
        picks = np.random.choice(len(self.buffer), batch_size, replace=False)
        # transpose the list of transitions into one column per field
        columns = zip(*(self.buffer[idx] for idx in picks))
        states, actions, rewards, dones, next_states = columns

        state_arr = np.array(states)
        action_arr = np.array(actions)
        reward_arr = np.array(rewards, dtype=np.float32)
        done_arr = np.array(dones, dtype=np.uint8)
        next_state_arr = np.array(next_states)
        return (state_arr, action_arr, reward_arr, done_arr, next_state_arr)

In [19]:
class Agent:
    """
    Plays the environment one step at a time with an epsilon-greedy policy
    and records every transition in the experience buffer.

    env:        environment exposing reset(), step(action) and action_space
    exp_buffer: object with an append(experience) method
    """
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode on the agent's own env and zero the reward sum."""
        # use self.env, not a module-level `env`, so the agent works with
        # whichever environment it was constructed with
        self.state = self.env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        """
        Play a single environment step and store the transition.

        With probability `epsilon` a random action is sampled from the env's
        action space; otherwise the action with the highest Q-value predicted
        by `net` for the current state is taken.

        Returns the total episode reward if this step ended the episode
        (the env is then reset), otherwise None.
        """
        done_reward = None

        # select action
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # add a batch dimension of 1; np.array(..., copy=False) is avoided
            # because NumPy 2 raises when a copy is unavoidable, as it is here
            state_v = torch.tensor(np.array([self.state])).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # play and save
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [20]:
def calc_loss(batch, net, tgt_net, device="cpu"):
    """
    Compute the MSE loss between the Q-values `net` predicts for the taken
    actions and the one-step targets built with `tgt_net`.

    batch:   (states, actions, rewards, dones, next_states) as returned by
             ExperienceBuffer.sample
    net:     network being trained; gradients flow through its predictions
    tgt_net: network used for next-state values; evaluated without gradients
    device:  torch device the batch tensors are moved to

    Returns a scalar loss tensor.
    """
    states, actions, rewards, dones, next_states = batch
    # np.asarray converts only when needed and, unlike np.array(copy=False)
    # on NumPy 2, does not raise when a copy is required
    states_v = torch.tensor(np.asarray(states)).to(device)
    next_states_v = torch.tensor(np.asarray(next_states)).to(device)
    # gather() needs an int64 index regardless of the platform's default int
    actions_v = torch.tensor(np.asarray(actions), dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(np.asarray(dones), dtype=torch.bool).to(device)

    # run the sampled batch
    # gather on dim 1 (actions): keep only the Q-value of the action taken
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # no_grad keeps gradients from flowing into the target net's estimate
    # of the next state, so no separate detach() is needed
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # no values for next state on last state
        next_state_values[done_mask] = 0.0
    expected_state_action_values = rewards_v + GAMMA*next_state_values

    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [33]:
# wrapped environment the agent trains on
env = make_env(GYM_ENV_NAME)
device = "cpu"

# Two networks with the same architecture, as in the DeepMind paper:
# `net` is the one being optimized and used to pick actions,
# `tgt_net` supplies the next-state values for the loss targets.
# Note that each is created with its own initial weights here; the training
# loop copies net's weights into tgt_net every SYNC_TARGET_FRAMES frames.
net = DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
print(net)

# TensorBoard writer for the scalars logged at the end of every episode
writer = SummaryWriter(comment="-dqn-" + GYM_ENV_NAME)

buffer = ExperienceBuffer(REPLAY_MAX_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

# only net's parameters are optimized
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
# total reward of every finished episode, in order
total_rewards = []
# number of environment steps played so far
frame_i = 0
# frame counter and wall-clock time at the end of the previous episode,
# used to compute the frames-per-second figure
ts_frame = 0
ts = time.time()
# highest 100-episode mean reward seen so far (None until the first episode ends)
best_mean_reward = None

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


In [34]:
# training loop
import os

# torch.save() does not create missing directories, so make sure the
# checkpoint directory exists before the first episode finishes
os.makedirs("models", exist_ok=True)

while True:
    frame_i += 1
    # linear epsilon decay, floored at EPSILON_END
    epsilon = max(EPSILON_END, EPSILON_START - frame_i / EPSILON_DECAY_LAST_FRAME)

    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        # the step just played was the last one of an episode
        total_rewards.append(reward)
        speed = (frame_i - ts_frame) / (time.time() - ts)
        ts_frame = frame_i
        ts = time.time()
        # mean over (at most) the last 100 finished episodes
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, reward %.3f, eps %.2f, speed %.2f fps" % (frame_i, len(total_rewards),
                                          mean_reward, epsilon, speed))
        writer.add_scalar("reward_mean_100_frames", mean_reward, frame_i)
        writer.add_scalar("reward_latest", reward, frame_i)
        writer.add_scalar("epsilon", epsilon, frame_i)
        writer.add_scalar("speed", speed, frame_i)

        if best_mean_reward is None or best_mean_reward < mean_reward:
            # save model params whenever the running mean improves
            torch.save(net.state_dict(), "models/" + GYM_ENV_NAME + "-best_%.0f.dat" % mean_reward)
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
        if mean_reward > MEAN_REWARD_BOUND:
            print("Solved in %d frames" % frame_i)
            break

    # enough transitions in buffer to start training?
    if len(buffer) < REPLAY_START_SIZE:
        continue

    # sync models: copy the training net's weights into the target net
    if frame_i % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    # one optimization step on a freshly sampled batch
    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()

writer.close()

818: done 1 games, reward -21.000, eps 0.99, speed 188.56 fps
1659: done 2 games, reward -20.500, eps 0.99, speed 1059.38 fps
Best mean reward updated -21.000 -> -20.500
2537: done 3 games, reward -20.667, eps 0.98, speed 1082.24 fps
3466: done 4 games, reward -20.500, eps 0.98, speed 1083.76 fps
4335: done 5 games, reward -20.400, eps 0.97, speed 1078.16 fps
Best mean reward updated -20.500 -> -20.400
5355: done 6 games, reward -20.333, eps 0.96, speed 1065.24 fps
Best mean reward updated -20.400 -> -20.333
6312: done 7 games, reward -20.429, eps 0.96, speed 1058.50 fps
7093: done 8 games, reward -20.500, eps 0.95, speed 1061.02 fps
7915: done 9 games, reward -20.556, eps 0.95, speed 1069.04 fps
8982: done 10 games, reward -20.400, eps 0.94, speed 1064.86 fps
10066: done 11 games, reward -20.273, eps 0.93, speed 474.20 fps
Best mean reward updated -20.333 -> -20.273
11080: done 12 games, reward -20.167, eps 0.93, speed 49.19 fps
Best mean reward updated -20.273 -> -20.167
11902: done 

126269: done 106 games, reward -19.040, eps 0.16, speed 45.78 fps
Best mean reward updated -19.120 -> -19.040
128394: done 107 games, reward -18.970, eps 0.14, speed 46.17 fps
Best mean reward updated -19.040 -> -18.970
131136: done 108 games, reward -18.790, eps 0.13, speed 46.58 fps
Best mean reward updated -18.970 -> -18.790
133102: done 109 games, reward -18.720, eps 0.11, speed 46.56 fps
Best mean reward updated -18.790 -> -18.720
136426: done 110 games, reward -18.580, eps 0.09, speed 46.50 fps
Best mean reward updated -18.720 -> -18.580
139436: done 111 games, reward -18.460, eps 0.07, speed 46.10 fps
Best mean reward updated -18.580 -> -18.460
143245: done 112 games, reward -18.250, eps 0.05, speed 45.80 fps
Best mean reward updated -18.460 -> -18.250
146601: done 113 games, reward -18.030, eps 0.02, speed 45.84 fps
Best mean reward updated -18.250 -> -18.030
148611: done 114 games, reward -17.650, eps 0.01, speed 45.30 fps
Best mean reward updated -18.030 -> -17.650
150837: do

279062: done 183 games, reward 8.250, eps 0.01, speed 45.89 fps
Best mean reward updated 7.860 -> 8.250
280749: done 184 games, reward 8.640, eps 0.01, speed 45.58 fps
Best mean reward updated 8.250 -> 8.640
282539: done 185 games, reward 9.020, eps 0.01, speed 46.12 fps
Best mean reward updated 8.640 -> 9.020
284211: done 186 games, reward 9.420, eps 0.01, speed 45.85 fps
Best mean reward updated 9.020 -> 9.420
285912: done 187 games, reward 9.800, eps 0.01, speed 46.21 fps
Best mean reward updated 9.420 -> 9.800
287583: done 188 games, reward 10.200, eps 0.01, speed 45.55 fps
Best mean reward updated 9.800 -> 10.200
289218: done 189 games, reward 10.530, eps 0.01, speed 46.00 fps
Best mean reward updated 10.200 -> 10.530
290853: done 190 games, reward 10.930, eps 0.01, speed 45.94 fps
Best mean reward updated 10.530 -> 10.930
292488: done 191 games, reward 11.310, eps 0.01, speed 45.87 fps
Best mean reward updated 10.930 -> 11.310
294123: done 192 games, reward 11.690, eps 0.01, spee

### play with models

In [21]:
# checkpoint to replay; the name follows the "models/<env>-best_<mean reward>.dat"
# pattern used when the training loop saves weights
MODEL_FILE = "models/" + GYM_ENV_NAME + "-best_19.dat"
# upper bound on environment steps per second while rendering
FPS = 30

In [22]:
# fresh environment for the demo episode
env = make_env(GYM_ENV_NAME)
# optional gym Monitor wrapper writing to the 'monitor' directory (left disabled)
#env = gym.wrappers.Monitor(env, directory='monitor', force=True)

# rebuild the network and restore the saved weights; the map_location
# callback returns each storage unchanged
net = DQN(env.observation_space.shape, env.action_space.n)
net.load_state_dict(torch.load(MODEL_FILE, map_location=lambda storage, _location: storage))

# episode state consumed by the play loop
state = env.reset()
total_reward = 0.0
c = collections.Counter()

In [23]:
# play one full episode greedily with the loaded model, rendering every step
while True:
    start_ts = time.time()
    env.render()

    # select action with model
    # (np.array(..., copy=False) is avoided: NumPy 2 raises when a copy is
    # unavoidable, which is always the case when wrapping in a list)
    state_v = torch.tensor(np.array([state]))
    # inference only, so no autograd graph is needed
    with torch.no_grad():
        q_vals = net(state_v).numpy()[0]
    # plain int keeps the Counter keys and the printed summary clean
    action = int(np.argmax(q_vals))
    c[action] += 1

    # play
    state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break

    # sleep off the rest of the frame budget to cap the speed at FPS
    delta = 1/FPS - (time.time() - start_ts)
    if delta > 0:
        time.sleep(delta)

print("Total reward: %.2f" % total_reward)
print("Action counts:", c)
# close through the outermost wrapper instead of reaching into env.env
env.close()

Total reward: 20.00
Action counts: Counter({0: 616, 3: 362, 4: 203, 1: 198, 2: 163, 5: 152})
