# PPO Breakout Example

### Atari Breakout

Please do note that this example may take a long time to train.

With the default 4 threads running on an 8-core CPU with a GTX 1080 Ti, it will take several hours to train to a decent level of play.

Running on a platform with more GPU power and a larger cluster of CPUs could significantly reduce training time.

Paper: https://arxiv.org/pdf/1705.05363.pdf

In [105]:
import copy
import multiprocessing as mp
import time
import timeit

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.backend import categorical_crossentropy
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten

from ludus.env import EnvController
from ludus.memory import MTMemoryBuffer
from ludus.policies import BaseTrainer
from ludus.utils import preprocess_atari, reshape_train_var

# Super Mario stuff
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv

In [2]:
def make_env():
    """Build a fresh 'SuperMarioBros-v0' environment.

    The raw environment is wrapped in ``BinarySpaceToDiscreteSpaceEnv`` with
    the ``COMPLEX_MOVEMENT`` action list, and the wrapped environment is
    returned.
    """
    return BinarySpaceToDiscreteSpaceEnv(
        gym_super_mario_bros.make('SuperMarioBros-v0'),
        COMPLEX_MOVEMENT,
    )

In [5]:
def filter_obs(obs):
    """Resize a frame, convert it to grayscale and scale it by 1/255.

    The frame is resized to the module-level ``obs_shape`` with linear
    interpolation, converted with ``cv2.COLOR_BGR2GRAY`` and divided by 255.

    NOTE(review): ``cv2`` and ``obs_shape`` are not defined in this part of
    the file; both must be in scope before this is called.
    NOTE(review): the conversion assumes BGR channel order -- confirm that
    matches the frames the environment returns.
    """
    resized = cv2.resize(obs, obs_shape, interpolation=cv2.INTER_LINEAR)
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    return gray / 255

In [None]:
def worker(make_env, max_steps=1000, act_repeat=6):
    """Play one random-action episode and feed the shared ICM training buffer.

    This mirrors a single episode of the module-level training loop: a random
    action is sampled, repeated for ``act_repeat`` environment steps, and the
    resulting transition is appended to ``train_data``. Whenever the buffer
    holds at least ``steps_per_epoch`` transitions, one ICM update is run and
    the buffer is emptied.

    Args:
        make_env: Zero-argument callable returning an environment exposing
            ``reset()``, ``step(action)`` and ``render()``.
        max_steps: Maximum number of actions sampled in the episode.
        act_repeat: Number of consecutive environment steps each sampled
            action is applied for. Must be at least 1, because the last
            observation and done flag are only assigned inside that loop.

    Returns:
        None. Results are accumulated in the module-level ``train_data``,
        ``train_iters``, ``ris``, ``lis``, ``lfs`` and ``lps``.

    NOTE(review): also reads module-level names that are not defined in this
    function (``obs_buffer``, ``render``, ``steps_per_epoch``, ``sess``,
    ``ri``, ``icm_losses``, ``icm_objective``, ``state_ph``, ``act_ph``,
    ``state_p_ph``); they must exist before the call.
    NOTE(review): if this is launched in separate processes, each process
    presumably updates its own copy of these globals -- confirm that is the
    intended design.
    """
    # These names are rebound further down (``+=`` / ``= []``). Without this
    # declaration Python treats them as locals and the first read raises
    # UnboundLocalError.
    global train_data, train_iters, ris, lis, lfs, lps

    env = make_env()
    obs = env.reset()
    obs = filter_obs(obs)

    ep_reward = 0
    for step in range(max_steps):
        act = np.random.randint(0, 7)

        # Repeat the sampled action, summing the reward over the repeats.
        step_reward = 0
        for i in range(act_repeat):
            obs_p, r, d, _ = env.step(act)
            step_reward += r
            if d:
                break
        ep_reward += step_reward

        # Record [obs_buffer_t, act_t, reward_t]; obs_buffer_t+1 is appended
        # below once the frame buffer has been advanced.
        train_data.append([obs_buffer.copy(), act, step_reward])

        # Shift the buffer one slot along its last axis and put the newest
        # filtered frame in the final slot.
        obs_p = filter_obs(obs_p)
        obs_buffer[:,:,:-1] = obs_buffer[:,:,1:]
        obs_buffer[:,:,-1] = obs_p

        train_data[-1].append(obs_buffer.copy())

        if render:
            env.render()
            time.sleep(0.02)

        if len(train_data) >= steps_per_epoch:
            np.random.shuffle(train_data)
            train_obs = np.array([x[0] for x in train_data])
            train_acts = np.array([x[1] for x in train_data])
            train_rewards = np.array([x[2] for x in train_data])
            train_obs_ps = np.array([x[3] for x in train_data])

            nri, li, lf, lp, _, _, _ = sess.run([ri] + icm_losses + icm_objective,
                                    feed_dict={
                                        state_ph: train_obs,
                                        act_ph: train_acts,
                                        state_p_ph: train_obs_ps
                                    })
            ris += nri
            lis += li
            lfs += lf
            lps += lp
            train_iters += 1

            train_data = []

        if d:
            break

In [6]:
# Random-action data collection and ICM training loop.
#
# NOTE(review): relies on names that are not defined in this cell (`env`,
# `obs_buffer`, `sess`, `ri`, `icm_losses`, `icm_objective`, `state_ph`,
# `act_ph`, `state_p_ph`); they must exist before this runs.
n_episodes = 1000000
steps_per_epoch = 2048  # transitions collected before each ICM update
print_freq = 8  # episodes between progress reports
act_skip = 6  # environment steps each sampled action is repeated for
max_steps = int(4096 / act_skip)
render = False

all_rewards = []
train_iters = 0
# Running sums for the current reporting window. Initialised here so the
# first `+=` below does not fail on an undefined name.
ris, lis, lfs, lps = 0, 0, 0, 0
train_data = [] # Formatted as [obs_buffer_t, act_t, reward_t, obs_buffer_t+1]
best_runs = []
for episode in range(n_episodes):
    obs = env.reset()
    obs = filter_obs(obs)
    # NOTE(review): the frame buffer is not re-initialised per episode while
    # the line below stays commented out.
    # obs_buffer = np.rollaxis(np.array([obs] * obs_buffer_size), 0, 3)
    ep_reward = 0
    # Transitions of this episode only. `train_data` is shuffled and emptied
    # on every ICM update, so it cannot be sliced to recover the episode.
    episode_data = []

    for step in range(max_steps):
        act = np.random.randint(0, 7)

        # Repeat the sampled action, summing the reward over the repeats.
        step_reward = 0
        for i in range(act_skip):
            obs_p, r, d, _ = env.step(act)
            step_reward += r
            if d:
                break
        ep_reward += step_reward

        # The same list object goes into both buffers, so appending the next
        # observation below completes the transition in both.
        transition = [obs_buffer.copy(), act, step_reward]
        train_data.append(transition)
        episode_data.append(transition)

        # Shift the buffer one slot along its last axis and put the newest
        # filtered frame in the final slot.
        obs_p = filter_obs(obs_p)
        obs_buffer[:,:,:-1] = obs_buffer[:,:,1:]
        obs_buffer[:,:,-1] = obs_p

        transition.append(obs_buffer.copy())

        if render:
            env.render()
            time.sleep(0.02)

        if len(train_data) >= steps_per_epoch:
            np.random.shuffle(train_data)
            train_obs = np.array([x[0] for x in train_data])
            train_acts = np.array([x[1] for x in train_data])
            train_rewards = np.array([x[2] for x in train_data])
            train_obs_ps = np.array([x[3] for x in train_data])

            nri, li, lf, lp, _, _, _ = sess.run([ri] + icm_losses + icm_objective,
                                    feed_dict={
                                        state_ph: train_obs,
                                        act_ph: train_acts,
                                        state_p_ph: train_obs_ps
                                    })
            ris += nri
            lis += li
            lfs += lf
            lps += lp
            train_iters += 1

            train_data = []

        if d:
            break

    if ep_reward >= 2300:
        # Keep an independent copy of the complete episode, in play order.
        best_runs.append(copy.deepcopy(episode_data))
        print(f'Run with {ep_reward} reward')

    all_rewards.append(ep_reward)

    if (episode + 1) % print_freq == 0:
        # No ICM update may have run in this window; dividing by NaN reports
        # 'nan' instead of raising ZeroDivisionError.
        n_updates = train_iters if train_iters else float('nan')
        print(f'R_e: {np.mean(all_rewards[-print_freq:])}, R_i: {ris/n_updates}')
        print(f'L_i: {lis/n_updates}, L_f: {lfs/n_updates}, L_p: {lps/n_updates}')
        print()

        ris, lis, lfs, lps = 0, 0, 0, 0
        train_iters = 0

In [41]:
def target():
    """Busy-loop workload: evaluate a fixed expression ten million times.

    The computed value is discarded and nothing is returned. Note that ``^``
    is bitwise XOR in Python, so ``2 ^ 10`` is 8 rather than 1024.
    """
    n_iterations = 10000000
    for _ in range(n_iterations):
        value = (5 * 5) / (2 ^ 10) + 34

In [60]:
# Sequential baseline: time one execution (number=1) of a snippet that
# defines target() and calls it 8 times back-to-back, i.e. 8 * 10,000,000
# loop iterations in a single process. The snippet defines its own target()
# inside the string, so it does not depend on the function defined above.
timeit.timeit("""
def target():
    for i in range(10000000):
        a = 5 * 5 / (2^10) + 34

for i in range(8): 
    target()""", number=1)

1.9958672008942813

In [103]:
# Multi-process variant: time one execution (number=1) of a snippet that
# starts 24 processes, each running int(10000000/3) loop iterations, and then
# joins them all. 24 * 3,333,333 is roughly the same 80,000,000 total
# iterations as the sequential baseline, so the two timings are comparable.
timeit.timeit("""
from multiprocessing import Process
def target():
    for i in range(int(10000000/3)):
        a = 5 * 5 / (2^10) + 34

ps = []
for i in range(24):
    ps.append(Process(target=target))
    ps[-1].start()

for i in range(len(ps)):
    ps[i].join()
""", number=1)

0.5992502459557727