# PPO Breakout Example

### Atari Breakout

Please do note that this example may take a long time to train.

With the default 4 threads running on an 8-core CPU with a GTX 1080 Ti, it will take several hours to train to a decent level of play.

Running on a platform with more GPU power and a larger cluster of CPUs could significantly reduce training time.

Paper: https://arxiv.org/pdf/1705.05363.pdf

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.backend import categorical_crossentropy
from ludus.policies import BaseTrainer
from ludus.env import EnvController
from ludus.utils import preprocess_atari, reshape_train_var, discount_rewards
from ludus.memory import MTMemoryBuffer
import gym
import time
from mpi4py import MPI
import cv2
import heapq
import multiprocessing as mp
# Super Mario stuff
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

In [2]:
def make_env():
    """Build a fresh Super Mario Bros environment.

    Returns:
        The 'SuperMarioBros-v0' environment wrapped in
        `BinarySpaceToDiscreteSpaceEnv` with the `COMPLEX_MOVEMENT` action list.
    """
    return BinarySpaceToDiscreteSpaceEnv(
        gym_super_mario_bros.make('SuperMarioBros-v0'),
        COMPLEX_MOVEMENT,
    )

In [3]:
def filter_obs(obs, obs_shape=(42, 42)):
    """Downscale a colour frame and convert it to a normalised grayscale image.

    Args:
        obs: Colour frame as returned by the environment.
            Assumed to be an (H, W, 3) uint8 array in RGB channel order, which
            is what gym-style environments normally emit -- TODO confirm for
            this environment.
        obs_shape: Target size handed to `cv2.resize` as `dsize`, which OpenCV
            interprets as (width, height).

    Returns:
        The resized grayscale frame divided by 255.
    """
    obs = cv2.resize(obs, obs_shape, interpolation=cv2.INTER_LINEAR)
    # Use the RGB conversion code: with COLOR_BGR2GRAY the red and blue
    # luminance weights would be swapped on an RGB frame.
    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
    return obs / 255

In [10]:
def worker(action_sets, max_steps=1000):
    """Run one random rollout and record a transition per chosen action set.

    At every decision point an action set is drawn uniformly at random from
    `action_sets` and its actions are executed one after another. Execution of
    the set stops early if the episode ends or `max_steps` environment steps
    have been taken.

    Args:
        action_sets: Sequence of action sets; each action set is a sequence of
            actions passed to `env.step` in order.
        max_steps: Upper bound on the number of `env.step` calls.

    Returns:
        Object-dtype array of shape (n_transitions, 4) whose columns are
        [observation, action set, summed reward, next observation].
        Observations are the outputs of `filter_obs`.
    """
    train_data = []
    env = make_env()
    try:
        obs = filter_obs(env.reset())

        step = 0
        while step < max_steps:
            act_idx = np.random.randint(len(action_sets))
            act_set = action_sets[act_idx]

            step_reward = 0
            for act in act_set:
                obs_p, r, d, _ = env.step(act)
                step_reward += r
                step += 1
                if d or step >= max_steps:
                    break

            obs_p = filter_obs(obs_p)
            train_data.append([obs, act_set, step_reward, obs_p])
            obs = obs_p

            if d:
                break
    finally:
        # Always release the emulator, even if the rollout raised.
        env.close()

    # Rows mix 2-D frames, variable-length action lists and scalars, so they
    # cannot form a regular numeric array. Filling a pre-sized object array
    # element by element keeps the (n, 4) layout on every NumPy version and
    # also for an empty rollout.
    transitions = np.empty((len(train_data), 4), dtype=object)
    for row_idx, row in enumerate(train_data):
        for col_idx, item in enumerate(row):
            transitions[row_idx, col_idx] = item
    return transitions

In [11]:
# Initial action sets: one single-action set for each action id 0 through 6.
train_act_sets = [[action_id] for action_id in range(7)]

In [69]:
# Collect 100 random rollouts, each capped at 50 environment steps.
training_data = []
for _rollout in range(100):
    training_data.append(worker(train_act_sets, 50))

In [70]:
# Total reward per rollout; column 2 of every transition row is its summed reward.
reward_list = [sum(rollout[:, 2]) for rollout in training_data]
print(f'Avg Reward: {np.mean(reward_list)}, Min: {np.min(reward_list)}, Max: {np.max(reward_list)}, Std: {np.std(reward_list)}')

Avg Reward: 52.93, Min: 19, Max: 93, Std: 10.108664600232812


In [71]:
# Sample episodes with probability proportional to how far their reward lies
# above the worst episode, so better episodes are more likely to be kept.
# (Shifting by the maximum instead would give the best episode probability 0
# and the worst episode the highest probability.)
max_reward = max(reward_list)
min_reward = min(reward_list)
scaled_rewards = [r - min_reward for r in reward_list]
reward_sum = sum(scaled_rewards)
if reward_sum > 0:
    scaled_rewards = [r / reward_sum for r in scaled_rewards]
else:
    # All episodes scored the same: fall back to uniform sampling.
    scaled_rewards = [1 / len(reward_list)] * len(reward_list)

# Sampling without replacement cannot draw more items than there are entries
# with non-zero probability.
n_selectable = sum(1 for prob in scaled_rewards if prob > 0)
n_select = min(10, n_selectable)

selected_ids = np.random.choice(len(training_data), size=n_select, replace=False, p=scaled_rewards)
top_data = [training_data[idx] for idx in selected_ids]

In [72]:
min_branch, max_branch = 2, 3

# Existing action sets as hashable tuples, used to skip sequences already known.
strain_act_sets = {tuple(act_set) for act_set in train_act_sets}

# branch_dicts[seq_len] maps each new flattened action sequence (built from
# `seq_len` consecutive action sets) to how often it occurs in the top episodes.
branch_dicts = {}
for seq_len in range(min_branch, max_branch + 1):
    count_dict = {}
    for episode in top_data:
        ep_acts = episode[:, 1]
        # Slide a window of `seq_len` consecutive action sets over the episode.
        for start in range(len(ep_acts) - seq_len + 1):
            new_act_set = tuple(np.concatenate(ep_acts[start:start + seq_len]))
            if new_act_set in strain_act_sets:
                continue
            count_dict[new_act_set] = count_dict.get(new_act_set, 0) + 1

    branch_dicts[seq_len] = count_dict

In [73]:
act_top_x = 2
top_acts = []
for n_branch in range(min_branch, max_branch + 1):
    # Keep the `act_top_x` most frequent candidate sequences for this branch length.
    ranked = heapq.nlargest(
        act_top_x,
        branch_dicts[n_branch].items(),
        key=lambda item: item[1],
    )
    for acts, _count in ranked:
        top_acts.append(list(acts))

top_acts

[[3, 4, 1, 1, 3], [1, 3, 0], [3, 4, 1, 1, 3, 0], [1, 3, 0, 1, 3]]

In [76]:
# Inspect the (sequence, count) pairs for one branch length. `n_branch` is
# whatever value the most recent `for n_branch in ...` loop left behind.
[*branch_dicts[n_branch].items()]

[((3, 4, 1, 1, 3, 0), 1),
 ((1, 3, 0, 1, 3), 1),
 ((0, 1, 3, 4), 1),
 ((1, 3, 4, 1), 1),
 ((4, 1, 0), 1),
 ((1, 0, 3, 0, 6), 1),
 ((0, 3, 0, 6, 0), 1),
 ((3, 0, 6, 0, 0, 3, 4), 1),
 ((0, 0, 3, 4, 4), 1),
 ((0, 3, 4, 4, 5), 1),
 ((4, 5, 3, 3, 0), 1),
 ((5, 3, 3, 0, 2), 1),
 ((3, 3, 0, 2, 3, 3, 0), 1),
 ((2, 3, 3, 0, 3, 4, 1), 1),
 ((3, 3, 0, 3, 4, 1, 3, 3, 0), 1),
 ((3, 4, 1, 3, 3, 0, 5), 1),
 ((3, 3, 0, 5, 5), 1),
 ((5, 5, 6), 1),
 ((5, 6, 1, 3), 1),
 ((6, 1, 3, 3, 4, 1), 1),
 ((1, 3, 3, 4, 1, 3, 4, 1), 1),
 ((3, 4, 1, 3, 4, 1, 0, 3, 4), 1),
 ((3, 4, 1, 0, 3, 4, 3), 1),
 ((0, 3, 4, 3, 1), 1),
 ((3, 1, 3, 3), 1)]

In [74]:
# Register the newly found sequences as selectable action sets.
# Sequences already present are skipped, so re-running this cell does not add
# duplicates; the same applies if one flattened sequence appears twice in
# `top_acts`.
for act in top_acts:
    if act not in train_act_sets:
        train_act_sets.append(act)

In [34]:
def render_episode(action_set, max_steps=50):
    """Render an episode that repeats a single action set, then print a summary.

    The actions in `action_set` are executed in order, rendering before every
    environment step with a short pause. The whole set is repeated up to
    `max_steps` times or until the episode ends.

    Args:
        action_set: Sequence of actions passed to `env.step` in order.
        max_steps: Maximum number of times the action set is repeated.

    Prints:
        The index of the last repetition started (-1 if none ran) and the
        total reward collected.
    """
    env = make_env()
    ep_reward = 0
    step = -1  # stays -1 when max_steps <= 0, so the summary print is still valid
    d = False
    try:
        env.reset()
        for step in range(max_steps):
            step_reward = 0
            for act in action_set:
                env.render()
                time.sleep(0.02)  # slow playback down so it is watchable
                _, r, d, _ = env.step(act)
                step_reward += r
                if d:
                    break
            ep_reward += step_reward

            if d:
                break
    finally:
        # Release the emulator and its render window even if stepping failed.
        env.close()

    print(step, ep_reward)

In [36]:
# NOTE(review): `a` is not defined in any cell visible in this notebook, so this
# call presumably relies on leftover kernel state -- TODO confirm which
# collection of action sets it should index.
render_episode(a[62])

0 1525
