# DQN on Breakout

## Downloading ROMs


In [1]:
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar

!pip3 install atari-py
!python -m atari_py.import_roms ROMS
!pip install "gym[atari]" "gym[accept-rom-license]" atari_py
!pip install -U "ray[rllib]==1.6"

## Creating Environment

In [9]:
import gym

# Instantiate the Breakout environment once up front; this fails fast if the
# ROM import in the previous section did not succeed.
breakout_env_id = 'Breakout-v0'
env_b = gym.make(breakout_env_id)

## Setting up Tune, Evaluation Function and Objective Function

In [4]:
from ray import tune
import ray
import ray.rllib.agents.dqn as dqn

def evaluation_fn(result):
    """Extract the score used to rank trials from one training result.

    Args:
        result: Mapping returned by a single ``trainer.train()`` call; it
            must contain the key ``'episode_reward_mean'``.

    Returns:
        The value stored under ``'episode_reward_mean'``.

    Raises:
        KeyError: If ``result`` has no ``'episode_reward_mean'`` entry.
    """
    mean_episode_reward = result['episode_reward_mean']
    return mean_episode_reward


def objective_fn(config):
    """Tune trainable: train a DQN agent and report its mean reward.

    Builds a ``DQNTrainer`` from ``config``, runs 100 training iterations and
    reports the score of every iteration back to Tune so trials can be
    compared on ``mean_reward``.

    Args:
        config: Trainer configuration dict for this trial, as passed in by
            ``tune.run`` (grid-search entries already resolved).
    """
    trainer = dqn.DQNTrainer(config=config)

    try:
        for i in range(100):
            # Perform one iteration of training the policy with DQN
            result = trainer.train()
            intermediate_score = evaluation_fn(result)

            # Feed the score back to Tune.
            tune.report(iterations=i, mean_reward=intermediate_score)
    finally:
        # Release the trainer's workers even if training raises, so a failed
        # or finished trial does not keep holding cluster resources.
        trainer.stop()

## Config Setting

In [5]:
# Start Ray. `ignore_reinit_error=True` makes this cell safe to re-run in a
# live kernel: a plain `ray.init()` raises RuntimeError if Ray is already up.
ray.init(ignore_reinit_error=True)
config = dqn.DEFAULT_CONFIG.copy()
# Grid-search over all three DQN extensions (dueling, double DQN and
# prioritised replay): 2 x 2 x 2 = 8 trials.
config["dueling"] = tune.grid_search([True, False])
config["double_q"] = tune.grid_search([True, False])
config["prioritized_replay"] = tune.grid_search([True, False])
config["env"] = 'Breakout-v0'
# Assigning a fresh dict replaces the "model" entry instead of mutating the
# nested dict shared with DEFAULT_CONFIG (`.copy()` above is shallow).
config["model"] = { "fcnet_hiddens": [64],
                    "fcnet_activation": 'relu',
    }



## Running Tune on DQNs

In [2]:
# Launch one trial per grid point of `config`; each trial executes
# `objective_fn`, and trials are ranked by the highest reported `mean_reward`.
analysis = tune.run(
    objective_fn,
    config=config,
    metric="mean_reward",
    mode="max",
)

In [8]:
#@title Grid Search Over DQNs
# One row per trial, taken at the point where that trial's mean_reward peaked.
df = analysis.dataframe(metric="mean_reward", mode="max")
# Show only the three swept switches next to the score they achieved.
df.loc[:, [
    'config/prioritized_replay',
    'config/double_q',
    'config/dueling',
    'mean_reward',
]]

Unnamed: 0,config/prioritized_replay,config/double_q,config/dueling,mean_reward
0,True,True,True,2.666667
1,True,False,True,5.36
2,True,True,False,5.68
3,True,False,False,6.62
4,False,True,True,1.91
5,False,False,True,3.62
6,False,True,False,1.83
7,False,False,False,2.37


## Tuning DQN with PER

In [3]:
from ray import tune
import ray
import ray.rllib.agents.dqn as dqn
import gym

# ray.init()
# Second sweep: keep the DQN variant fixed (no dueling, no double Q,
# prioritised replay on) and grid-search only the two PER exponents,
# giving 3 x 3 = 9 trials.
config_2 = dqn.DEFAULT_CONFIG.copy()
config_2.update({
    "dueling": False,
    "double_q": False,
    "prioritized_replay": True,
    "env": 'Breakout-v0',
    "model": {
        "fcnet_hiddens": [64],
        "fcnet_activation": 'relu',
    },
    "prioritized_replay_alpha": tune.grid_search([0.1, 0.6, 1]),
    "prioritized_replay_beta": tune.grid_search([0, 0.4, 1]),
})
# Optional GPU training, left disabled:
# config_2["num_gpus"] = 1

#@title Running Tune
# Same trainable and ranking metric as the first sweep, different search space.
analysis_2 = tune.run(
    objective_fn,
    config=config_2,
    metric="mean_reward",
    mode="max",
)

In [7]:
# One row per PER trial, taken at the point where its mean_reward peaked.
df_2 = analysis_2.dataframe(metric="mean_reward", mode="max")
# Show the two swept PER exponents next to the score they achieved.
df_2.loc[:, [
    'config/prioritized_replay_alpha',
    'config/prioritized_replay_beta',
    'mean_reward',
]]

Unnamed: 0,config/prioritized_replay_alpha,config/prioritized_replay_beta,mean_reward
0,0.1,0.0,3.42
1,0.6,0.0,5.1
2,1.0,0.0,4.71
3,0.1,0.4,4.31
4,0.6,0.4,3.23
5,1.0,0.4,4.78
6,0.1,1.0,3.99
7,0.6,1.0,1.95
8,1.0,1.0,2.333333


## Training PPO

In [None]:
from ray import tune
import ray
import gym
import ray.rllib.agents.ppo as ppo

# ray.init()
# Start from RLlib's default PPO settings and only override the environment.
config_ppo = {**ppo.DEFAULT_CONFIG, "env": 'Breakout-v0'}
trainer = ppo.PPOTrainer(config=config_ppo)

# Run 100 PPO training iterations; `result` ends up holding the metrics of
# the last iteration only.
for i in range(100):
    result = trainer.train()

# def objective_fn_ppo(config):

#     trainer = ppo.PPOTrainer(config=config)

#     for i in range(100):
#       # Perform one iteration of training the policy with PPO
#       result = trainer.train()
#       intermediate_score = evaluation_fn(result)

#       # Feed the score back to Tune.
#       tune.report(iterations=i, mean_reward=intermediate_score)


# analysis_ppo = tune.run(
#         objective_fn_ppo, # train using objective function
#         metric="mean_reward", # metric to optimise
#         mode="max", # maximise the mean reward
#         config=config_ppo)

2022-04-20 18:26:13,554	INFO trainable.py:109 -- Trainable.setup took 19.640 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


agent_timesteps_total: 4000
custom_metrics: {}
date: 2022-04-20_18-38-03
done: false
episode_len_mean: 213.8
episode_media: {}
episode_reward_max: 3.0
episode_reward_mean: 0.8
episode_reward_min: 0.0
episodes_this_iter: 10
episodes_total: 10
experiment_id: abad9f50298f464fb0f55c0d705f1b89
hostname: d661aa9c8280
info:
  learner:
    default_policy:
      custom_metrics: {}
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.0017715163994580507
        entropy_coeff: 0.0
        kl: 0.008319293148815632
        model: {}
        policy_loss: -0.010277577675879002
        total_loss: 1.2135791778564453
        vf_explained_var: -0.21532458066940308
        vf_loss: 1.2221927642822266
  num_agent_steps_sampled: 4000
  num_agent_steps_trained: 4000
  num_steps_sampled: 4000
  num_steps_trained: 4000
iterations_since_restore: 1
node_ip: 172.28.0.2
num_healthy_workers: 2
off_policy_estimator: {}
perf:
  cpu_util_percent: 98.6