In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
import gym
from ludus.policies import PPOTrainer
from ludus.env import EnvController
from ludus.utils import preprocess_atari
from ludus.memory import MTMemoryBuffer
import copy
# Super Mario stuff
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

In [2]:
def make_env():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    return env

In [3]:
env = make_env() # This instance of the environment is only used
                              # to get action dimensions
in_shape = [84, 84, 3] # Size of reshaped observations

# Creating a conv net for the policy and value estimator
obs_op = Input(shape=in_shape)
conv1 = Conv2D(16, 8, (4, 4), activation='relu')(obs_op)
max_pool1 = MaxPool2D(2, 2)(conv1)
conv2 = Conv2D(32, 4, (2, 2), activation='relu')(max_pool1)
max_pool2 = MaxPool2D(2, 2)(conv2)
dense1 = Dense(256, activation='relu')(max_pool2)
flattened = Flatten()(dense1)

# Output probability distribution over possible actions
act_probs_op = Dense(env.action_space.n, activation='softmax')(flattened)

# Output value of observed state
value_op = Dense(1)(flattened)

# Wrap a Proximal Policy Optimization Trainer on top of the network
network = PPOTrainer(obs_op, act_probs_op, value_op, act_type='discrete', ppo_iters=80)

  result = entry_point.load(False)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [4]:
n_episodes = 10000 # Total episodes of data to collect
max_steps = 2048 # Max number of frames per game
batch_size = 4 # Smaller = faster, larger = stabler
print_freq = 1 # How many training updates between printing progress

In [5]:
agent_hist = {} # Keeps track of up to 3 previous frames for each agent

# Create observation transformation that adds the two last frames on
# as two extra dimensions
def new_obs_transform(obs, agent_id):
    new_frame = preprocess_atari(obs.squeeze()) # First preprocess the new frame
    
    if agent_id in agent_hist: # Case for a continued episode
        agent_hist[agent_id] = agent_hist[agent_id][1:]
        agent_hist[agent_id].append(new_frame)
    else: # Case for a new episode
        agent_hist[agent_id] = [new_frame, new_frame, new_frame]
    
    # Format the data
    arr = np.array(agent_hist[agent_id])
    return np.swapaxes(arr, 0, 3).squeeze()

############################################################
############################################################

mtmb = MTMemoryBuffer() # Create a memory buffer to store the episode data

# Edit the memory buffer's start_rollout function so that every time
# an episode ends, it resets the respective agent's history
old_start_rollout = mtmb.start_rollout

def new_start_rollout(agent_id):
    old_start_rollout(agent_id)
    agent_hist.pop(agent_id, None)
    
mtmb.start_rollout = new_start_rollout

In [6]:
# Create the environment controller for generating game data
ec = EnvController(make_env, n_threads=4, memory_buffer=mtmb)
# Set the preprocessing function for observations
ec.set_obs_transform(new_obs_transform)

In [7]:
update_rewards = []

for i in range(int(n_episodes / batch_size)):
    ec.sim_episodes(network, batch_size, max_steps) # Simualate env to generate data
    update_rewards.append(ec.get_avg_reward()) # Append rewards to reward tracker list
    
    for j in range(len(ec.mb.rollouts)):
        print(sum(np.array(ec.mb.rollouts[j])[:,2]))
    
    dat = ec.get_data() # Get all the data gathered
    network.train(dat) # Train the network with PPO
    if i != 0 and i % print_freq == 0:
        print(f'Update #{i}, Avg Reward: {np.mean(update_rewards[-print_freq:])}') # Print an update

452
452
452
443
704
443
574
446
Update #1, Avg Reward: 541.75
452
1354
452
446
Update #2, Avg Reward: 676.0
1113
823
1079
1490
Update #3, Avg Reward: 1126.25
574
580
452
953
Update #4, Avg Reward: 639.75
1242
443
580
574
Update #5, Avg Reward: 709.75
446
452
580
698
Update #6, Avg Reward: 544.0
580
580
574
580
Update #7, Avg Reward: 578.5
580
580
580
820
Update #8, Avg Reward: 640.0
574
574
574
571
Update #9, Avg Reward: 573.25
574
574
580
1064
Update #10, Avg Reward: 698.0
452
580
827
580
Update #11, Avg Reward: 609.75
580
580
580
580
Update #12, Avg Reward: 580.0
832
1114
941
1318
Update #13, Avg Reward: 1051.25
580
621
580
580
Update #14, Avg Reward: 590.25
0
-32
-40
-4
Update #15, Avg Reward: -19.0
81
200
165
134
Update #16, Avg Reward: 145.0
580
452
907
580
Update #17, Avg Reward: 629.75
1304
1013
580
947
Update #18, Avg Reward: 961.0
1426
589
966
1276
Update #19, Avg Reward: 1064.25
456
624
579
580
Update #20, Avg Reward: 559.75
166
136
184
182
Update #21, Avg Reward: 167.0
580
1

198
211
338
110
Update #177, Avg Reward: 214.25
364
495
435
333
Update #178, Avg Reward: 406.75
679
964
580
310
Update #179, Avg Reward: 633.25
329
343
623
514
Update #180, Avg Reward: 452.25
562
803
775
580
Update #181, Avg Reward: 680.0
1401
1050
1303
1293
Update #182, Avg Reward: 1261.75
-8
-16
11
-30
Update #183, Avg Reward: -10.75
-97
-101
-101
-95
Update #184, Avg Reward: -98.5
-102
-102
-102
-101
Update #185, Avg Reward: -101.75
-102
-102
-102
-102
Update #186, Avg Reward: -102.0
-83
-84
-81
-71
Update #187, Avg Reward: -79.75
-39
-48
-69
-79
Update #188, Avg Reward: -58.75
-53
-65
-35
-38
Update #189, Avg Reward: -47.75
-39
-47
-39
-17
Update #190, Avg Reward: -35.5
-47
-43
-39
-10
Update #191, Avg Reward: -34.75
44
30
3
8
Update #192, Avg Reward: 21.25
14
7
24
-9
Update #193, Avg Reward: 9.0
108
216
102
207
Update #194, Avg Reward: 158.25
136
170
451
215
Update #195, Avg Reward: 243.0
205
121
272
198
Update #196, Avg Reward: 199.0
288
135
472
311
Update #197, Avg Reward: 301.5

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-e1806c511795>", line 11, in <module>
    network.train(dat) # Train the network with PPO
  File "/home/ejmejm/MLProjects/ludus/ludus/policies.py", line 271, in update_func
    self.advatange_holders: self.old_advantages})
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
    run_metadata_ptr)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1121, in _run
    np_val = np.asarray(subfeed_val, dtype=subfeed_dtype)
  File "/home/ejmejm/anaconda3/lib/python3.6/site-packages/numpy/core/numeric.py", line 492, in asarray
    return array(a, dtype, copy=False, order=order)
KeyboardInterrupt

During handling of the above exception, another exception occu

KeyboardInterrupt: 

In [None]:
ec.render_episodes(network, 5, max_steps) # Render an episode to see the result