DEEP REINFORCEMENT LEARNING EXPLAINED - 15 - 16 - 17
# **Deep Q-Network (DQN)**

OpenAI Pong

In [1]:
import gym
import gym.spaces

# Gym environment id used by every cell of this notebook.
DEFAULT_ENV_NAME = "PongNoFrameskip-v4" 
# Throwaway environment without the custom wrappers, created only to inspect it.
test_env = gym.make(DEFAULT_ENV_NAME)
# Size of the discrete action space.
print(test_env.action_space.n)

6


In [2]:
print(test_env.unwrapped.get_action_meanings())

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


In [3]:
print(test_env.observation_space.shape)

(210, 160, 3)



Type of hardware accelerator available (this cell was written for Colab; the output below was captured on a local machine)

In [4]:
!nvidia-smi 

Tue Mar  1 11:46:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 497.09       Driver Version: 497.09       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A    0C    P8    N/A /  N/A |     37MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import warnings
warnings.filterwarnings('ignore')

## OpenAI Gym Wrappers

In [6]:
# Taken from 
# https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py

import cv2
import numpy as np
import collections

class FireResetEnv(gym.Wrapper):
    """Start every episode by taking action 1 ('FIRE') and then action 2.

    Construction asserts that the wrapped environment reports 'FIRE' as the
    meaning of action 1 and exposes at least three actions.
    """

    def __init__(self, env=None):
        super(FireResetEnv, self).__init__(env)
        action_meanings = env.unwrapped.get_action_meanings()
        assert action_meanings[1] == 'FIRE'
        assert len(action_meanings) >= 3

    def step(self, action):
        """Forward `action` unchanged to the wrapped environment."""
        return self.env.step(action)

    def reset(self):
        """Reset the wrapped environment and perform the two start-up actions.

        Returns:
            The observation the agent should act on first.
        """
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            # The observation is overwritten by the next step, so only the
            # reset itself matters here.
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            # Return the observation of the freshly reset episode rather than
            # the last frame of the episode that just ended.
            obs = self.env.reset()
        return obs

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action `skip` times and return an element-wise max frame.

    The returned observation is the maximum over the raw frames currently held
    in a two-slot buffer; the returned reward is the sum over the repeated steps.
    """

    def __init__(self, env=None, skip=4):
        super(MaxAndSkipEnv, self).__init__(env)
        self._skip = skip
        # Holds at most the two most recent raw observations.
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Apply `action` up to `skip` times, stopping early when the episode ends."""
        reward_sum = 0.0
        done = None
        for _ in range(self._skip):
            frame, step_reward, done, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += step_reward
            if done:
                break
        stacked_frames = np.stack(self._obs_buffer)
        pooled_frame = np.max(stacked_frames, axis=0)
        return pooled_frame, reward_sum, done, info

    def reset(self):
        """Empty the frame buffer, reset the wrapped env and buffer its first frame."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame


class ProcessFrame84(gym.ObservationWrapper):
    """Convert raw RGB frames into 84x84x1 uint8 grayscale observations."""

    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        """Grayscale, resize and crop one frame.

        Accepts frames with 210*160*3 or 250*160*3 elements; any other size
        fails the assertion with "Unknown resolution.".
        """
        known_shapes = {
            210 * 160 * 3: [210, 160, 3],
            250 * 160 * 3: [250, 160, 3],
        }
        target_shape = known_shapes.get(frame.size)
        assert target_shape is not None, "Unknown resolution."
        rgb = np.reshape(frame, target_shape).astype(np.float32)

        # Weighted sum of the three colour channels.
        gray = rgb[:, :, 0] * 0.299 + rgb[:, :, 1] * 0.587 + rgb[:, :, 2] * 0.114

        # Resize to 110 rows x 84 columns, then keep rows 18..101 (84 rows).
        shrunk = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
        cropped = shrunk[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)


class BufferWrapper(gym.ObservationWrapper):
    """Keep the most recent observations in a buffer that is shifted on every step.

    The observation space is the wrapped space with its bounds repeated
    `n_steps` times along axis 0.
    """

    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        inner_space = env.observation_space
        stacked_low = inner_space.low.repeat(n_steps, axis=0)
        stacked_high = inner_space.high.repeat(n_steps, axis=0)
        self.observation_space = gym.spaces.Box(stacked_low, stacked_high, dtype=dtype)

    def reset(self):
        """Zero the buffer, reset the wrapped env and push its first observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        first_obs = self.env.reset()
        return self.observation(first_obs)

    def observation(self, observation):
        """Drop the oldest entry, append `observation` last and return the buffer."""
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        # NOTE: the same array object is returned on every call and is mutated
        # in place by the next call; callers that keep it must copy it.
        return self.buffer


class ImageToPyTorch(gym.ObservationWrapper):
    """Move axis 2 of each observation to the front (e.g. HWC -> CHW)."""

    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        source_shape = self.observation_space.shape
        channels_first = (source_shape[-1], source_shape[0], source_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=channels_first, dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)


class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert observations to float32 and divide them by 255."""

    def observation(self, obs):
        as_float = np.array(obs).astype(np.float32)
        return as_float / 255.0

def make_env(env_name):
    """Create `env_name` and apply the notebook's preprocessing wrappers.

    Wrappers are applied innermost first: MaxAndSkipEnv, FireResetEnv,
    ProcessFrame84, ImageToPyTorch, BufferWrapper (4 steps), ScaledFloatFrame.
    """
    wrapper_chain = (
        MaxAndSkipEnv,
        FireResetEnv,
        ProcessFrame84,
        ImageToPyTorch,
        lambda inner: BufferWrapper(inner, 4),
        ScaledFloatFrame,
    )
    env = gym.make(env_name)
    for wrap in wrapper_chain:
        env = wrap(env)
    return env

## The DQN model


In [7]:
import torch
import torch.nn as nn        # Pytorch neural network package
import torch.optim as optim  # Pytorch optimization package
import torch.nn.functional as F

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs top to bottom on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# True: the BVAE encoder is moved to `device` and preprocessing runs there.
# False: the BVAE stays where it was loaded and only its output is moved to `device`.
BVAEtoDevice = False

In [8]:
# Taken from 
# https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/dqn_model.py

import numpy as np
features = 64  # BVAE latent size; the encoder emits 2 * features values per frame


class DQN(nn.Module):
    """Fully connected Q-network operating on BVAE-encoded frame stacks.

    The convolutional front end of the reference DQN is not used here: frames
    are encoded outside this module and the network only maps the concatenated
    per-frame encodings to one Q-value per action.

    Args:
        input_shape: shape of the environment observation. Only
            ``input_shape[0]`` (the number of stacked frames) is used.
        n_actions: number of discrete actions, i.e. the number of outputs.
    """

    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()

        # One encoding of size 2 * features per stacked frame, concatenated.
        n_frames = input_shape[0]
        self.fc = nn.Sequential(
            nn.Linear(features * 2 * n_frames, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def forward(self, x):
        """Return Q-values for a batch of flattened encodings ``[batch, n_frames * 2 * features]``."""
        return self.fc(x)
        

In [9]:
# Build the fully wrapped environment and a throwaway network on `device`,
# then print the network so its layer sizes can be checked.
test_env = make_env(DEFAULT_ENV_NAME)
test_net = DQN(test_env.observation_space.shape, test_env.action_space.n).to(device)
print(test_net)

DQN(
  (fc): Sequential(
    (0): Linear(in_features=512, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


Load pretrained BVAE

In [10]:
features = 64  # latent size; the encoder emits 2 * features values per input


class LinearVAE(nn.Module):
    """Fully connected VAE over flattened 84x84 frames.

    Only `encode` is used for preprocessing, so the class defines no `forward`.
    The training-time forward pass that used to sit here as commented-out code
    reshaped the encoder output to (-1, 2, features), took [:, 0, :] as the
    mean and [:, 1, :] as the log-variance, sampled with `reparameterize` and
    decoded through dec0/dec1 (ReLU) and dec2 (sigmoid).
    NOTE(review): the decoder layers are presumably kept so that the saved
    state dict loads with matching keys — confirm against the checkpoint.
    """

    def __init__(self):
        super().__init__()
        flat_pixels = 84 * 84  # 7056 values per frame

        # encoder
        self.enc0 = nn.Linear(in_features=flat_pixels, out_features=1024)
        self.enc1 = nn.Linear(in_features=1024, out_features=512)
        self.enc2 = nn.Linear(in_features=512, out_features=features * 2)

        # decoder
        self.dec0 = nn.Linear(in_features=features, out_features=512)
        self.dec1 = nn.Linear(in_features=512, out_features=1024)
        self.dec2 = nn.Linear(in_features=1024, out_features=flat_pixels)

    def reparameterize(self, mu, log_var):
        """Draw mu + eps * std with eps ~ N(0, 1).

        :param mu: mean from the encoder's latent space
        :param log_var: log variance from the encoder's latent space
        """
        sigma = torch.exp(0.5 * log_var)
        noise = torch.randn_like(sigma)  # same shape as sigma
        return mu + (noise * sigma)

    def encode(self, x):
        """Run the three encoder layers; the last one has no activation."""
        hidden = F.relu(self.enc0(x))
        hidden = F.relu(self.enc1(hidden))
        return self.enc2(hidden)

In [11]:
import os

# Location of the pretrained BVAE weights. Set the BVAE_WEIGHTS_PATH
# environment variable to use a different checkpoint; the default is the
# path this notebook was originally run with.
BVAE_WEIGHTS_PATH = os.environ.get(
    "BVAE_WEIGHTS_PATH",
    'C:/Users/erics/Documents/Programme/Bachelorarbeit/models/BVAE_Pong/B=10VAEFEB25',
)

BVAE = LinearVAE()
# map_location="cpu" lets a checkpoint that was saved from a CUDA device load
# on a machine without one; the model is moved explicitly below if requested.
BVAE.load_state_dict(torch.load(BVAE_WEIGHTS_PATH, map_location="cpu"))
# The encoder is only used as a fixed preprocessor (the optimizer is built from
# the DQN parameters alone), so switch it to inference mode and freeze it.
BVAE.eval()
BVAE.requires_grad_(False)
if BVAEtoDevice:
    BVAE.to(device)

## Training

Load Tensorboard extension

In [12]:
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

Import required modules and define the hyperparameters

In [13]:
import time
import numpy as np
import collections

# When True, Agent.play_step renders the environment on every step.
VISUALIZEtraining = True
# The training loop stops once the mean reward over the most recent episodes
# exceeds this bound (see the inline note about the intended value).
MEAN_REWARD_BOUND = -19   #Change to 19.0    

# Discount factor applied to the next-state value in the training target.
gamma = 0.99                   
# Number of transitions sampled from the replay buffer per optimisation step.
batch_size = 32                
# Capacity of the experience replay buffer.
replay_size = 10000            
# Learning rate passed to the Adam optimizer.
learning_rate = 1e-4           
# The target network is synchronised whenever frame_idx is a multiple of this value.
sync_target_frames = 1000      
# Optimisation is skipped while the buffer holds fewer transitions than this.
# Equal to replay_size, so training starts only once the buffer is full.
replay_start_size = 10000      

# Epsilon-greedy schedule: epsilon starts at eps_start, is multiplied by
# eps_decay on every frame and never drops below eps_min.
eps_start=1.0
eps_decay=.999985
eps_min=0.02

Experience replay buffer

In [14]:
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'])


class ExperienceReplay:
    """Bounded store of `Experience` tuples with random sampling.

    Once `capacity` entries are held, appending drops the oldest entry.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw `batch_size` distinct experiences and return them column-wise.

        Returns:
            (states, actions, rewards, dones, next_states) as NumPy arrays;
            rewards are float32 and dones are uint8.
        """
        chosen = np.random.choice(len(self.buffer), batch_size, replace=False)
        picked = [self.buffer[position] for position in chosen]
        states, actions, rewards, dones, next_states = zip(*picked)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )


Agent

In [15]:
class Agent:
    """Plays one environment epsilon-greedily and records every transition.

    Reads the notebook-level flags `VISUALIZEtraining` (render on every step)
    and `BVAEtoDevice` (whether the encoder runs on `device`).

    Args:
        env: environment to act in; all interaction goes through this object.
        exp_buffer: object with an `append` method that receives `Experience` tuples.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode on the agent's own environment."""
        # Use self.env, not a notebook-level `env`, so the agent works with
        # whatever environment it was constructed with.
        self.state = self.env.reset()
        self.total_reward = 0.0

    def _greedy_action(self, BVAE, net, device):
        """Return the action with the highest Q-value for the current state."""
        # np.expand_dims adds the batch axis without relying on
        # np.array(..., copy=False), which raises on NumPy >= 2 when a copy
        # cannot be avoided.
        state_v = torch.tensor(np.expand_dims(self.state, 0))
        if BVAEtoDevice:
            state_v = state_v.to(device)
        # Pure inference: no autograd graph is needed for action selection.
        with torch.no_grad():
            n_frames = state_v.size(1)
            # Encode each stacked frame separately, then concatenate the encodings.
            encoded = BVAE.encode(state_v[0].reshape(n_frames, -1))
            features_v = encoded.reshape(1, -1)
            if not BVAEtoDevice:
                features_v = features_v.to(device)
            q_vals_v = net(features_v)
        _, act_v = torch.max(q_vals_v, dim=1)
        return int(act_v.item())

    def play_step(self, BVAE, net, epsilon=0.0, device="cpu"):
        """Take one environment step and store the transition.

        Args:
            BVAE: encoder with an `encode` method applied to flattened frames.
            net: Q-network applied to the concatenated encodings.
            epsilon: probability of taking a random action.
            device: device the Q-network lives on.

        Returns:
            The episode's total reward if the step ended the episode, else None.
        """
        if VISUALIZEtraining:
            self.env.render()

        done_reward = None
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            action = self._greedy_action(BVAE, net, device)

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward


In [16]:
def preproBVAE(states_TOpreprocess, encoder=None):
    """Encode every frame of a batch of frame stacks in one pass.

    Rows of the result are in the same order as the input batch, so they stay
    aligned with the actions, rewards and done flags sampled alongside them.

    Args:
        states_TOpreprocess: tensor whose axis 0 is the batch and axis 1 the
            stacked frames; everything after axis 1 is flattened per frame.
        encoder: object with an `encode` method; defaults to the notebook-level
            `BVAE`.

    Returns:
        Tensor of shape [batch, frames, encoding_size].
    """
    if encoder is None:
        encoder = BVAE
    batch = states_TOpreprocess.size(0)
    n_frames = states_TOpreprocess.size(1)
    # Flatten each frame; the encoder's linear layers act on the last axis only,
    # so the whole batch is encoded at once instead of sample by sample.
    flat_frames = states_TOpreprocess.reshape(batch, n_frames, -1)
    # The encoder is a fixed preprocessor here, so no autograd graph is needed.
    with torch.no_grad():
        return encoder.encode(flat_frames)

In [17]:
import datetime
import math
print(">>>Training starts at ",datetime.datetime.now())

>>>Training starts at  2022-03-01 11:46:29.156064


Main training loop

In [18]:
env = make_env(DEFAULT_ENV_NAME)

net = DQN(env.observation_space.shape, env.action_space.n).to(device)
target_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
# Start the target network from the online network's weights instead of an
# unrelated random initialisation.
target_net.load_state_dict(net.state_dict())
writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)

buffer = ExperienceReplay(replay_size)
agent = Agent(env, buffer)

epsilon = eps_start

optimizer = optim.Adam(net.parameters(), lr=learning_rate)
# Pre-fill the history with 100 entries of -21 so that every improvement in the
# first real episodes moves the running mean.
total_rewards = [-21.000] * 100

frame_idx = 0
best_mean_reward = None

try:
    while True:
        frame_idx += 1
        epsilon = max(epsilon * eps_decay, eps_min)

        reward = agent.play_step(BVAE, net, epsilon, device=device)
        if reward is not None:
            # An episode just finished: log it and check the stopping criterion.
            total_rewards.append(reward)

            # Mean over the last 10 episodes (reduced from 100 so that a
            # downward trend also shows up quickly).
            mean_reward = np.mean(total_rewards[-10:])

            print("%d:  %d games, mean reward %.3f, (epsilon %.3f)" % (
                frame_idx, len(total_rewards), mean_reward, epsilon))

            writer.add_scalar("epsilon", epsilon, frame_idx)
            # The tag still says "reward_100" although the window is 10 episodes;
            # it is left unchanged so curves stay comparable with earlier runs.
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), DEFAULT_ENV_NAME + "-best.dat")
                best_mean_reward = mean_reward
                print("Best mean reward updated %.3f" % (best_mean_reward))

            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        # Keep collecting experience until the replay buffer is large enough.
        if len(buffer) < replay_start_size:
            continue

        states, actions, rewards, dones, next_states = buffer.sample(batch_size)

        # Encode the sampled frames with the BVAE on the device it lives on.
        bvae_device = device if BVAEtoDevice else 'cpu'
        states_TOpreprocess = torch.tensor(states).to(bvae_device)
        next_states_TOpreprocess = torch.tensor(next_states).to(bvae_device)

        # preproBVAE must return rows in the same order as its input, because
        # actions, rewards and dones below are matched to states by index.
        states_preprocessed = preproBVAE(states_TOpreprocess).to(device)
        next_states_preprocessed = preproBVAE(next_states_TOpreprocess).to(device)

        # Concatenate the per-frame encodings: [batch, frames * encoding_size].
        states_v = states_preprocessed.reshape(states_preprocessed.size(0), -1)
        next_states_v = next_states_preprocessed.reshape(next_states_preprocessed.size(0), -1)

        # gather() needs int64 indices regardless of the platform's default
        # NumPy integer width.
        actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
        rewards_v = torch.tensor(rewards).to(device)
        # Boolean mask: indexing with uint8 tensors is deprecated in PyTorch.
        done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

        state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

        # The target is a constant with respect to the online network, so it is
        # computed without building an autograd graph.
        with torch.no_grad():
            next_state_values = target_net(next_states_v).max(1)[0]
            # Terminal transitions have no future value.
            next_state_values[done_mask] = 0.0
            expected_state_action_values = next_state_values * gamma + rewards_v

        loss_t = nn.MSELoss()(state_action_values, expected_state_action_values)

        optimizer.zero_grad()
        loss_t.backward()
        optimizer.step()
        if frame_idx % sync_target_frames == 0:
            target_net.load_state_dict(net.state_dict())
finally:
    # Close the TensorBoard writer even when training is interrupted
    # (e.g. by KeyboardInterrupt) so buffered events are not lost.
    writer.close()

762:  101 games, mean reward -21.000, (epsilon 0.989)
Best mean reward updated -21.000
1680:  102 games, mean reward -20.900, (epsilon 0.975)
Best mean reward updated -20.900
2729:  103 games, mean reward -20.900, (epsilon 0.960)
3704:  104 games, mean reward -20.800, (epsilon 0.946)
Best mean reward updated -20.800
4673:  105 games, mean reward -20.800, (epsilon 0.932)
5503:  106 games, mean reward -20.800, (epsilon 0.921)
6581:  107 games, mean reward -20.600, (epsilon 0.906)
Best mean reward updated -20.600
7431:  108 games, mean reward -20.600, (epsilon 0.895)
8271:  109 games, mean reward -20.500, (epsilon 0.883)
Best mean reward updated -20.500
9052:  110 games, mean reward -20.500, (epsilon 0.873)
10020:  111 games, mean reward -20.500, (epsilon 0.860)


KeyboardInterrupt: 

In [None]:
print(">>>Training ends at ",datetime.datetime.now())

Performance

In [None]:
tensorboard  --logdir=runs

## Using the model

In [None]:
import gym
import time
import numpy as np

import torch

import collections

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
FPS = 25

Tuning the image rendering in Colab


In [None]:
# Taken from 
# https://towardsdatascience.com/rendering-openai-gym-envs-on-binder-and-google-colab-536f99391cc7

#!apt-get install -y xvfb x11-utils

#!pip install pyvirtualdisplay==0.2.* \
#             PyOpenGL==3.1.* \
#             PyOpenGL-accelerate==3.1.*

#!pip install gym[box2d]==0.17.*

import pyvirtualdisplay

# Create and start an invisible 1400x900 virtual display; the handle is kept in
# `_display` and the return value of start() is discarded.
# NOTE(review): presumably only needed on headless hosts such as Colab/Binder
# (see the article linked above) — confirm before running on a local desktop.
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
_ = _display.start()

In [None]:
# Taken (partially) from 
# https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/03_dqn_play.py


model = 'PongNoFrameskip-v4-best.dat'
record_folder = "video"
visualize = True

env2 = make_env(DEFAULT_ENV_NAME)
if record_folder:
    env2 = gym.wrappers.Monitor(env2, record_folder, force=True)
# NOTE: this rebinds `net` to a fresh CPU copy loaded from the checkpoint.
net = DQN(env2.observation_space.shape, env2.action_space.n)
net.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage))
net.eval()

# The Q-network consumes BVAE encodings, not raw frames, so every observation
# goes through the same encoder as during training. The encoder may live on
# the CPU or the GPU depending on BVAEtoDevice.
bvae_device = next(BVAE.parameters()).device

state = env2.reset()
total_reward = 0.0

try:
    while True:
        start_ts = time.time()
        if visualize:
            env2.render()
        with torch.no_grad():
            # Add the batch axis, encode each stacked frame and concatenate
            # the encodings into one feature vector for the CPU network.
            state_v = torch.tensor(np.expand_dims(state, 0)).to(bvae_device)
            features_v = preproBVAE(state_v).reshape(1, -1).cpu()
            q_vals = net(features_v).numpy()[0]
        action = int(np.argmax(q_vals))

        state, reward, done, _ = env2.step(action)
        total_reward += reward
        if done:
            break
        if visualize:
            # Sleep for the remainder of the frame budget to cap playback at FPS.
            delta = 1/FPS - (time.time() - start_ts)
            if delta > 0:
                time.sleep(delta)
finally:
    # Always release the environment (and finalise the recording, if any),
    # including when playback is interrupted.
    env2.close()
print("Total reward: %.2f" % total_reward)