# Atari Boxing with Deep Q-Learning



Imports, environment creation, and a printout of the available actions

In [None]:
# Install atari-py straight from the Kojoley fork on GitHub.
# NOTE(review): no branch/tag/commit is given, so this installs whatever the
# repository's default branch holds at install time — consider pinning a ref.
!pip install git+https://github.com/Kojoley/atari-py.git

In [None]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from collections import deque, Counter
from skimage import color
from skimage import io
import IPython
from pathlib import Path
import imageio
import base64
from PIL import Image

# Build the Atari environment and bring it to its initial state
env = gym.make("Boxing-v0")
env.reset()

# Show the action space and the human-readable name of every action
actions_list = env.action_space
print(actions_list)
print(env.env.get_action_meanings())

# Take one random step so that a real frame is available for inspection;
# only the observation (first element of the step result) is kept
observation = env.step(env.action_space.sample())[0]
print("Showing a sample observation")
plt.imshow(observation)
plt.show()

print("Observation shape registered: {}".format(observation.shape))

Observation preprocessing

In [None]:
def rgb2gray(rgb):
  """Convert an RGB image to grayscale with a weighted sum of its channels.

  Args:
    rgb: array indexed as [row, column, channel]; channels 0, 1 and 2 are
      read as red, green and blue.

  Returns:
    Array without the channel axis holding
    0.2989 * R + 0.5870 * G + 0.1140 * B for every pixel.
  """
  red = rgb[:, :, 0]
  green = rgb[:, :, 1]
  blue = rgb[:, :, 2]
  return 0.2989 * red + 0.5870 * green + 0.1140 * blue

def observation_preprocessing(observation):
  """Shrink a raw frame to a single-channel (80, 60, 1) network input.

  The frame is cropped to rows 20-179 and columns 20-139 keeping every
  second pixel, converted to grayscale with rgb2gray, and given a trailing
  channel axis. No intensity normalisation is applied.

  Args:
    observation: RGB frame indexed as [row, column, channel].

  Returns:
    Array of shape (80, 60, 1).
  """
  cropped = observation[20:180:2, 20:140:2]
  gray = rgb2gray(cropped)
  # reshape doubles as a shape check: it fails unless the crop is 80x60
  return gray.reshape(80, 60)[:, :, np.newaxis]

In [None]:
# Preprocess the sample frame once and reuse it for the plot and the shape report
processed_frame = observation_preprocessing(observation)
plt.imshow(processed_frame[:,:,0])
plt.show()
print("Observation preprocessing shape: {}".format(processed_frame.shape))

Model initialization

Hyperparameters definition

In [None]:
# Training schedule
num_episodes = 2000      # upper bound of the episode counter in the training loop
maximum_steps = 3000     # cap on environment steps per training episode
# NOTE(review): steps_train, start_steps and window_dim are not referenced by
# any other visible cell — confirm whether they are still needed.
steps_train = 4
start_steps = 2000
window_dim = 4


# Exploration
# NOTE(review): eps_min / eps_max / eps_decay_steps are not referenced by any
# other visible cell; the training loop computes epsilon from the episode index.
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 500000
# Best average evaluation reward seen so far; updated by model_compare
avg_best_reward = -1000

# Discount applied to the bootstrapped next-state value in training_step
discount_factor = 0.95

# Text file to which model_compare appends one line per evaluation
scores_file = 'scores_history.txt'

#Network parameters
batch_size = 32
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-6)
loss_fn = tf.keras.losses.mean_squared_error

Creation of the model

In [None]:
def create_model(input_shape, output_shape):
  """Build the convolutional Q-network and print its summary.

  Two strided convolutions feed a 256-unit hidden layer and a linear output
  layer with one unit per action.

  Args:
    input_shape: shape passed to model.build, including the batch axis.
    output_shape: number of units in the output layer (one Q-value each).

  Returns:
    The built, uncompiled Sequential model.
  """
  model = models.Sequential()
  model.add(layers.Conv2D(filters=16, kernel_size=(8,8), strides=4, padding='SAME', activation='relu'))
  model.add(layers.Conv2D(filters=32, kernel_size=(4,4), strides=2, padding='SAME', activation='relu'))
  model.add(layers.Flatten())
  # The hidden layer needs a non-linearity: two Dense layers with no activation
  # in between are equivalent to a single linear map.
  # NOTE(review): layer shapes are unchanged, so existing weight files still
  # load, but weights trained with the linear hidden layer will behave
  # differently and should be re-evaluated / retrained.
  model.add(layers.Dense(256, activation='relu'))
  # Linear output: raw Q-value estimates
  model.add(layers.Dense(output_shape))

  model.build(input_shape=input_shape)
  model.summary()
  return model

Building the model (training and target)

In [None]:
#Defining input shape and output shape of the model:
#batches of four stacked 80x60 frames in, one Q-value per action out
input_shape = (None, 80, 60, 4)
output_shape = len(env.env.get_action_meanings())

#Let's build our Q-Networks
training_network = create_model(input_shape, output_shape)
target_network = create_model(input_shape, output_shape)
#Both networks are initialised independently; copy the weights so the target
#network starts out identical to the training network
target_network.set_weights(training_network.get_weights())

#No metrics: 'loss' is not a Keras metric identifier and 'accuracy' has no
#meaning for Q-value regression. The visible training code uses its own
#gradient loop, so compile only records the optimizer and the loss.
training_network.compile(optimizer=optimizer,
              loss=loss_fn)

Self-play: replay buffer and helper function definitions

In [None]:
#Self play parameters: maximum number of transitions kept in the replay buffer
replay_buffer_len = 10000

#Buffer is made from a deque — double ended queue; because maxlen is set,
#appending to a full buffer drops the oldest transition
replay_buffer = deque(maxlen=replay_buffer_len)

In [None]:
def epsilon_greedy_policy(state_array, model, epsilon=0):
    """Pick an action with an epsilon-greedy rule.

    Args:
      state_array: input handed to model.predict.
      model: object exposing predict(state_array) that returns Q-values.
      epsilon: probability of choosing a uniformly random action instead of
        the greedy one.

    Returns:
      A random index below env.action_space.n when exploring, otherwise the
      argmax of the predicted Q-values.
    """
    exploring = np.random.rand() < epsilon
    if not exploring:
      return np.argmax(model.predict(state_array))
    return np.random.randint(env.action_space.n)

def select_greedy_action(state_array, model):
  """Return the index of the largest Q-value predicted for state_array.

  Args:
    state_array: input handed to model.predict.
    model: object exposing predict(state_array) that returns Q-values.

  Returns:
    Index of the maximum over the flattened prediction.
  """
  predicted = np.asarray(model.predict(state_array))
  return predicted.argmax()

In [None]:
#Create the input tensor with 4 frame in succession
def create_input_tensor(x):
  """Stack a sequence of preprocessed frames into one network input.

  A plain reshape of the (n_frames, H, W, 1) stack to (H, W, n_frames) keeps
  the memory order and therefore mixes pixels from different frames in every
  channel. The frame axis has to be moved to the channel position instead.

  Args:
    x: sequence (list, deque, ...) of frames shaped (H, W, 1) or (H, W),
      oldest first.

  Returns:
    Array of shape (1, H, W, n_frames) in which channel i is exactly frame i.
  """
  frames = np.array(list(x))
  # Drop the per-frame singleton channel axis: (n, H, W, 1) -> (n, H, W)
  frames = frames.reshape(frames.shape[:3])
  # Frames become channels: (n, H, W) -> (H, W, n)
  stacked = np.moveaxis(frames, 0, -1)
  return np.expand_dims(stacked, axis=0)

In [None]:
def play_one_step(env, state, epsilon, model, step, simulation=False):
  """Advance the environment by one action and record the transition.

  Args:
    env: environment exposing step(action).
    state: list of the most recent preprocessed frames, oldest first.
    epsilon: exploration probability used when not simulating.
    model: network used to choose the action.
    step: step index supplied by the caller (not used in the body).
    simulation: when exactly True the action is chosen greedily, otherwise
      with the epsilon-greedy policy.

  Returns:
    The (next_obs, reward, done, info) tuple returned by env.step.

  Side effects:
    Appends (state_tensor, action, reward, next_state_tensor, done) to the
    module-level replay_buffer.
  """
  state_tensor = create_input_tensor(state)
  if simulation is True:
    action = select_greedy_action(state_tensor, model)
  else:
    action = epsilon_greedy_policy(state_tensor, model, epsilon)

  next_obs, reward, done, info = env.step(action)

  # Slide the frame window: drop the oldest frame, add the newest
  next_state = state.copy()
  next_state.pop(0)
  next_state.append(observation_preprocessing(next_obs))

  transition = (state_tensor, action, reward, create_input_tensor(next_state), done)
  replay_buffer.append(transition)
  return next_obs, reward, done, info

### Training

Sampling function to sample states among self-played games

In [None]:
def sample_states(batch_size):
  """Draw a random batch of transitions from the replay buffer.

  Indices are drawn uniformly with replacement from the module-level
  replay_buffer.

  Args:
    batch_size: number of transitions to draw.

  Returns:
    Tuple (states, actions, rewards, next_states, dones), each a numpy array
    built from the matching field of the sampled transitions.
  """
  picks = np.random.randint(len(replay_buffer), size=batch_size)
  transitions = [replay_buffer[pick] for pick in picks]
  columns = []
  for field in range(5):
    columns.append(np.array([transition[field] for transition in transitions]))
  return tuple(columns)

In [None]:
#Single training step
def training_step(batch_size, episode, print_loss=False):
  """Run one gradient update of training_network on a sampled batch.

  Args:
    batch_size: number of transitions sampled from the replay buffer.
    episode: episode index, only used in the optional log line.
    print_loss: when True, print the loss of this update.
  """
  #Sampling experiences
  states, actions, rewards, next_states, dones = sample_states(batch_size)
  #The buffer stores every state with its own leading batch axis of size 1,
  #so a sampled batch is (batch, 1, H, W, C); fold that axis away so the
  #network receives (batch, H, W, C)
  states = states.reshape((-1,) + states.shape[-3:])
  next_states = next_states.reshape((-1,) + next_states.shape[-3:])

  #Predicting next states Q values
  #NOTE(review): the bootstrap target comes from training_network itself;
  #target_network is kept in sync elsewhere in the notebook but is not
  #consulted here — confirm whether it should provide these values.
  next_Q_values = training_network.predict(next_states)
  max_next_Q_values = np.max(next_Q_values, axis=1)
  target_Q_values = rewards + (1 - dones) * discount_factor * max_next_Q_values
  #Column vector, matching Q_values below. With a (batch,) target against a
  #(batch, 1) prediction the squared difference broadcasts to (batch, batch)
  #and every prediction is compared with every target.
  target_Q_values = target_Q_values.reshape(-1, 1)

  mask = tf.one_hot(actions, output_shape)
  with tf.GradientTape() as tape:
    all_Q_values = training_network(states)
    #Keep only the Q-value of the action that was actually taken
    Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
    loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
  if print_loss:
    print("Loss value: {}, in episode {}".format(loss, episode))
  grads = tape.gradient(loss, training_network.trainable_variables)
  optimizer.apply_gradients(zip(grads, training_network.trainable_variables))

Self play function

In [None]:
def play_episode(env, model):
  """Play one episode with the greedy policy and report its reward.

  The frame window is primed with four copies of the first preprocessed
  observation, then the environment is stepped until it reports done.

  Args:
    env: environment exposing reset() and step(action).
    model: network used to choose actions greedily.

  Returns:
    Tuple (state, reward, reward_accumulator): the final frame window, the
    reward of the last step, and the summed reward of the episode.
  """
  obs = env.reset()
  state = deque((observation_preprocessing(obs) for _ in range(4)), maxlen=4)

  reward_accumulator = 0
  while True:
    action = select_greedy_action(create_input_tensor(list(state)), model)

    # Apply the action and slide the frame window
    obs, reward, done, info = env.step(action)
    state.append(observation_preprocessing(obs))
    reward_accumulator += reward
    if done:
      break

  print("Total episode reward: {}".format(reward_accumulator))
  return state, reward, reward_accumulator

#Function to evaluate a model by its average greedy-play reward
def evaluate_model(model, env, num_episodes=10):
  """Average the total reward of several greedy episodes.

  Args:
    model: network passed to play_episode.
    env: environment passed to play_episode.
    num_episodes: number of evaluation episodes to play (default 10, the
      value that used to be hard-coded).

  Returns:
    Mean of the per-episode total rewards.
  """
  reward_list = []
  for _ in range(num_episodes):
    # play_episode returns (state, last_reward, total_reward)
    _, _, total_reward = play_episode(env, model)
    reward_list.append(total_reward)

  return np.mean(np.array(reward_list))

#Compare new model's result with the last best model result
def model_compare(model1, environment, episode):
  """Evaluate model1 and report whether it beats the best score so far.

  The average score is appended to scores_file on every call. When it is
  strictly greater than the module-level avg_best_reward, that global is
  updated.

  Args:
    model1: network to evaluate.
    environment: environment the evaluation episodes are played on.
    episode: training episode index, written to the score log.

  Returns:
    True if model1 set a new best average reward, False otherwise.
  """
  global avg_best_reward
  print("Evaluating training model")
  # Evaluate on the environment that was passed in, not on the global `env`
  training_model_value = evaluate_model(model1, environment)
  with open(scores_file, "a") as scores_f:
    scores_f.write(f"\nepisode:{episode} average_score:{training_model_value}")
  if training_model_value > avg_best_reward:
    print("New best model found")
    avg_best_reward = training_model_value
    return True
  return False

In [None]:
#Check for pretrained model weights to import
#NOTE(review): the training loop saves checkpoints as
#weight_path_prefix + <episode> + weight_path_suffix, whereas this cell looks
#for the un-numbered file — a checkpoint has to be renamed/copied to be
#picked up here. Confirm this is intended.
weight_path_prefix = "training-weights"
weight_path_suffix = ".h5"
weight_path = weight_path_prefix + weight_path_suffix
print("Checking for pretrained model weights")
#is_file() is already False for a missing path, so no separate exists() check
if Path(weight_path).is_file():
  print("Existing weights file found, loading files on the training model")
  training_network.load_weights(weight_path)
  target_network.load_weights(weight_path)
  print("Successfully loaded weights")
  print("Computing current best model average reward")
  avg_best_reward = evaluate_model(training_network, env)
else:
  print("Pretrained model not found starting training from the beginning")
  #Message now states the value that is actually assigned
  print("Set current best model average reward to -1000")
  avg_best_reward = -1000

Training

In [None]:
#Training the model
#First episode index of this run.
#NOTE(review): presumably non-zero because training resumes an earlier run — confirm.
start_episode = 516
for episode in range(start_episode, num_episodes):
    print("Episode: {} start".format(episode))

    #Every 25 episodes evaluate the training network and checkpoint it if it
    #is the best so far. This must happen BEFORE the reset below: evaluation
    #plays whole episodes on this same env, so doing it after the reset would
    #leave the training episode running on a finished environment with a
    #stale frame window.
    if episode % 25 == 0 and episode > 50:
      if model_compare(training_network, env, episode):
        weight_path = weight_path_prefix + str(episode) + weight_path_suffix
        training_network.save_weights(weight_path)
        target_network.load_weights(weight_path)

    #Fresh episode: prime the frame window with the first observation
    obs = env.reset()
    state = deque(maxlen=4)
    while len(state) < 4:
      state.append(observation_preprocessing(obs))

    #Exploration rate depends only on the episode index, so compute it once
    epsilon = max(1 - episode/10000, 0.01)

    for step in range(maximum_steps):
      obs, reward, done, info = play_one_step(env, list(state), epsilon, training_network, step)
      state.append(observation_preprocessing(obs))
      if done:
          break
      #Train on every second step once past the warm-up episodes; log the
      #loss every 200 steps
      if episode > 50 and step % 2 == 0:
        training_step(batch_size, episode, print_loss=(step % 200 == 0))

In [None]:
#Generate self play video to evaluate the performances of the system
def create_video(env, model, video_filename = 'imageio'):
  """Record greedy play to an mp4 file and return it as embeddable HTML.

  Five episodes are played with the greedy policy; every raw observation is
  written as one frame of the video.

  Args:
    env: environment exposing reset() and step(action).
    model: network used to choose actions greedily.
    video_filename: output file name without the ".mp4" extension.

  Returns:
    An IPython.display.HTML object embedding the video. Returning it (rather
    than discarding it) lets the notebook render the player when the call is
    the last expression of a cell.
  """
  def embed_mp4(filename):
    """Embeds an mp4 file in the notebook."""
    # Context manager so the file handle is closed after reading
    with open(filename, 'rb') as video_file:
      video = video_file.read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
      <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return IPython.display.HTML(tag)


  num_episodes = 5
  print("Setting video name")
  video_filename = video_filename + ".mp4"
  with imageio.get_writer(video_filename, fps=60) as video:
    for ep in range(num_episodes):
        print("Starting simulation for episode {}".format(ep))
        # Prime the frame window with the first observation
        state = deque(maxlen=4)
        obs = env.reset()
        while len(state) < 4:
          state.append(observation_preprocessing(obs))

        terminated = False
        while not terminated:
            action = select_greedy_action(create_input_tensor(list(state)), model)

            # Take action
            obs, _, terminated, _ = env.step(action)
            state.append(observation_preprocessing(obs))
            video.append_data(obs)
  print("Simulation completed video is ready to be downloaded")
  return embed_mp4(video_filename)

In [None]:
#Simulation: record a video of the training network playing
#Uncomment the next line to load the weights stored at weight_path first
#training_network.load_weights(weight_path)
create_video(env, training_network)