Problem statement: train a DQN agent on the OpenAI Gym Atari environment MsPacman-v0:
https://gym.openai.com/envs/MsPacman-v0

In [1]:
%load_ext autoreload
%autoreload 2

import gym
#!pip3 install box2d
import random
import torch
from torchvision import transforms

import numpy as np
from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

from dqn_agent_Atari_3D_prioritized import Agent

!python -m pip install pyvirtualdisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

You are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
# Pre-processing settings (independent of the chosen game).
states_stack_depth = 4    # number of consecutive encoded frames stacked into one input
input_shape = (84, 84)    # size every frame is resized to before stacking

# Build and seed the Atari environment.
# Alternative game: gym.make('SpaceInvaders-v0')
env = gym.make('MsPacman-v0')
env.seed(0)

# Raw observation shape and number of discrete actions reported by the environment.
state_space, action_space = env.observation_space.shape, env.action_space.n

print('State shape: ', state_space)
print('Number of actions: ', action_space)

State shape:  (210, 160, 3)
Number of actions:  9


In [3]:
# For inputs basic pre-processing
encode_states = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize(input_shape),
    transforms.ToTensor()
    ]
)

In [4]:
def state2StackedInputs(states_stack, current_state, last_state=None):
    """Encode a frame, push it onto the frame stack and return the stacked input.

    The current frame is encoded with ``encode_states`` and combined with the
    encoded previous frame by an element-wise maximum. The merged frame is
    appended to ``states_stack`` (which is mutated in place; with a bounded
    deque the oldest entry drops out).

    Params
    ======
        states_stack (deque): most recent merged frames, oldest first
        current_state (array): raw frame returned by the environment
        last_state (array or None): raw previous frame. ``None`` means there is
            no previous frame (e.g. right after ``env.reset()``); the current
            frame is then pushed unmerged.

    Returns
    =======
        numpy array with every frame of ``states_stack`` stacked along a new
        leading axis.
    """
    # Drop the single channel axis produced by the grayscale encoding (1xHxW -> HxW)
    encoded_state = encode_states(current_state).squeeze(0)

    if last_state is None:
        merged_state = encoded_state
    else:
        last_encoded_state = encode_states(last_state).squeeze(0)
        # Element-wise max of the two consecutive frames
        merged_state = torch.max(encoded_state, last_encoded_state)

    states_stack.append(merged_state)  # newest frame in, oldest frame out

    # Convert the whole stack into one array for the network input
    return torch.stack(list(states_stack)).numpy()

In [5]:
# Disabled demo: watch an untrained agent play one episode.
# The code below is wrapped in a triple-quoted string, so none of it runs;
# executing this cell only echoes the string back as the cell output.
"""
untrained_agent = Agent(state_size=states_stack_depth, action_size=action_space, seed=10)

# watch an untrained agent
states_stack = deque([torch.zeros(input_shape) for z in range(states_stack_depth)], maxlen=states_stack_depth)

state = env.reset()
state_inputs = state2StackedInputs(states_stack, state, np.zeros(state.shape, dtype=np.uint8))
last_state = state

img = plt.imshow(env.render(mode='rgb_array'))
for j in range(1000):
    action = untrained_agent.act(state_inputs)
    img.set_data(env.render(mode='rgb_array')) 
    plt.axis('off')
    display.display(plt.gcf())
    display.clear_output(wait=True)
    next_state, reward, done, _ = env.step(action)
    
    next_state_inputs = state2StackedInputs(states_stack, next_state, last_state)
    state_inputs = next_state_inputs
    last_state = next_state
    
    if done:
        break 
        
env.close()
"""

"\nuntrained_agent = Agent(state_size=states_stack_depth, action_size=action_space, seed=10)\n\n# watch an untrained agent\nstates_stack = deque([torch.zeros(input_shape) for z in range(states_stack_depth)], maxlen=states_stack_depth)\n\nstate = env.reset()\nstate_inputs = state2StackedInputs(states_stack, state, np.zeros(state.shape, dtype=np.uint8))\nlast_state = state\n\nimg = plt.imshow(env.render(mode='rgb_array'))\nfor j in range(1000):\n    action = untrained_agent.act(state_inputs)\n    img.set_data(env.render(mode='rgb_array')) \n    plt.axis('off')\n    display.display(plt.gcf())\n    display.clear_output(wait=True)\n    next_state, reward, done, _ = env.step(action)\n    \n    next_state_inputs = state2StackedInputs(states_stack, next_state, last_state)\n    state_inputs = next_state_inputs\n    last_state = next_state\n    \n    if done:\n        break \n        \nenv.close()\n"

In [6]:
def loadTrainedModel(agent, path):
    """Restore the agent's local Q-network and optimizer from a checkpoint file.

    The checkpoint is expected to be a dict with the keys ``'model'`` and
    ``'optimizer'`` (the layout written by ``saveTrainedModel``).

    Params
    ======
        agent: object exposing ``qnetwork_local`` and ``optimizer``
        path (str): checkpoint file to read

    NOTE(review): only ``qnetwork_local`` and ``optimizer`` are restored here;
    any other network held by the agent is left untouched. Confirm that is intended.
    """
    # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
    # map_location='cpu' lets a checkpoint written on a GPU machine be read on a
    # CPU-only one; load_state_dict then copies the values into the agent's own
    # parameters, wherever they live.
    state_dicts = torch.load(path, map_location='cpu')
    print(state_dicts['model'].keys())

    agent.qnetwork_local.load_state_dict(state_dicts['model'])
    agent.optimizer.load_state_dict(state_dicts['optimizer'])

In [7]:
def saveTrainedModel(agent, path):
    """Write the agent's local Q-network and optimizer state to a checkpoint file.

    The checkpoint is a dict with the keys ``'model'`` and ``'optimizer'``.
    Missing parent directories of ``path`` are created first, so a long
    training run does not fail at its first checkpoint because the output
    folder does not exist yet.

    Params
    ======
        agent: object exposing ``qnetwork_local`` and ``optimizer``
        path (str): checkpoint file to write
    """
    import os  # local import keeps this cell self-contained

    target_dir = os.path.dirname(path)
    if target_dir:  # empty when `path` is a bare file name
        os.makedirs(target_dir, exist_ok=True)

    state_dicts = {'model': agent.qnetwork_local.state_dict(), 'optimizer': agent.optimizer.state_dict()}
    torch.save(state_dicts, path)

In [8]:
# Create the agent used by the training and evaluation cells below.
# `state_size` receives the number of stacked frames per input and
# `action_size` the number of discrete actions reported by `env`.
agent = Agent(state_size=states_stack_depth, action_size=action_space, seed=510)

current device: cpu


In [9]:
# Disabled: restore a previously saved checkpoint into `agent`.
# The code is wrapped in a triple-quoted string, so it does not run;
# executing this cell only echoes the string back as the cell output.
"""
model_dir = 'saved_models/'
model_name = 'SpaceInvaders-3D-all-dicts-AWS.pt'

loadTrainedModel(agent, model_dir+model_name)
"""

"\nmodel_dir = 'saved_models/'\nmodel_name = 'SpaceInvaders-3D-all-dicts-AWS.pt'\n\nloadTrainedModel(agent, model_dir+model_name)\n"

In [12]:
# Checkpoint location used by dqn_3D() and by the manual save cell.
# NOTE(review): the file name says "SpaceInvaders" although `env` is created as
# 'MsPacman-v0'; saving here may overwrite an existing SpaceInvaders checkpoint.
# Confirm the name is intended.
model_dir = 'saved_models/'
model_name = 'SpaceInvaders-3D-all-dicts-AWS.pt'

def dqn_3D(n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.1, eps_decay=0.9995, skipframe_max=4):
    """Deep Q-Learning training loop for the notebook-level ``agent`` and ``env``.

    The agent selects an action on every frame, but only a subset of the
    transitions is handed to ``agent.step``: one after every ``skipframe_max``
    skipped frames, plus the terminal transition of each episode. The episode
    ends as soon as the environment reports ``done``.

    The model is checkpointed with ``saveTrainedModel`` every 100 episodes and
    once the average score over the last 100 episodes reaches 1000.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        skipframe_max (int): number of consecutive transitions skipped between
            two calls to ``agent.step``

    Returns
    =======
        list with the total reward collected in every episode played
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    frame_counter = 0                  # keep track of number of frames seen

    eps = eps_start                    # initialize epsilon

    for i_episode in range(1, n_episodes+1):

        state = env.reset()

        # stack to keep track of most recent states, initially all zeros
        states_stack = deque([torch.zeros(input_shape) for z in range(states_stack_depth)], maxlen=states_stack_depth)

        # the first frame has no predecessor: merge it with an all-zero frame
        state_inputs = state2StackedInputs(states_stack, state, np.zeros(state.shape, dtype=np.uint8))
        last_state = state

        score = 0
        skip_counter = 0
        for t in range(max_t):

            action = agent.act(state_inputs, eps)  # action recommendation

            next_state, reward, done, _ = env.step(action)
            frame_counter += 1

            next_state_inputs = state2StackedInputs(states_stack, next_state, last_state)

            # Always hand the terminal transition to the agent. Previously the
            # loop kept stepping a finished environment until the skip counter
            # happened to reach zero again.
            if done or skip_counter >= skipframe_max:
                agent.step(state_inputs, action, reward, next_state_inputs, done)
                skip_counter = 0
            else:
                skip_counter += 1

            state_inputs = next_state_inputs
            last_state = next_state

            score += reward
            if done:
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score

        eps = max(eps_end, eps_decay*eps) # decrease epsilon

        avg_score = np.mean(scores_window)  # rolling average, computed once per episode

        print('\rEpisode {}\tAverage Score: {:.2f}\t# of frames: {}'.format(i_episode,
                                                                            avg_score,
                                                                            frame_counter
                                                                           ), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}\teps: {:.2f}\t# of frames: {}'.format(i_episode,
                                                                                             avg_score,
                                                                                             eps,
                                                                                             frame_counter))
            saveTrainedModel(agent, model_dir+model_name)

        if avg_score >= 1000.0:
            # clamp at 0 so a run shorter than the 100-episode window does not
            # report a negative episode count
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(max(i_episode-100, 0),
                                                                                         avg_score))

            saveTrainedModel(agent, model_dir+model_name)
            break

    return scores

# Train for 200 episodes and keep the per-episode scores.
scores = dqn_3D(n_episodes=200)

# Plot the score of every episode against its index.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Episode 1	Average Score: 210.00	# of frames: 685

KeyboardInterrupt: 

[array([[0.00063199]], dtype=float32),
 array([[0.00785875]], dtype=float32),
 array([[0.02915528]], dtype=float32),
 array([[0.03270942]], dtype=float32),
 array([[0.05577008]], dtype=float32),
 array([[0.00034221]], dtype=float32),
 array([[0.00782581]], dtype=float32),
 array([[0.00044461]], dtype=float32),
 array([[0.01641655]], dtype=float32),
 array([[0.05557887]], dtype=float32),
 array([[0.02947921]], dtype=float32),
 array([[0.03255444]], dtype=float32),
 array([[0.00810209]], dtype=float32),
 array([[0.01653468]], dtype=float32),
 array([[0.00040368]], dtype=float32),
 array([[0.04588009]], dtype=float32),
 array([[0.05575639]], dtype=float32),
 array([[0.02987044]], dtype=float32),
 array([[0.01870964]], dtype=float32),
 array([[9.927164e-05]], dtype=float32),
 array([[0.00745537]], dtype=float32),
 array([[0.0336853]], dtype=float32),
 array([[0.04558716]], dtype=float32),
 array([[5.765269e-05]], dtype=float32),
 array([[0.01890237]], dtype=float32),
 array([[0.05596461]],

In [14]:
# Manually checkpoint the current local network and optimizer state
# to the same file that dqn_3D() writes to.
saveTrainedModel(agent, model_dir+model_name)

In [None]:
# Watch the agent play a few episodes with its current weights.
# To evaluate a saved checkpoint instead, first run:
#     loadTrainedModel(agent, model_dir+model_name)
agent.qnetwork_local.eval()
agent.qnetwork_target.eval()

try:
    for i in range(3):
        # fresh, all-zero frame stack for every episode
        states_stack = deque([torch.zeros(input_shape) for z in range(states_stack_depth)], maxlen=states_stack_depth)

        state = env.reset()
        # The first frame has no predecessor: merge it with an all-zero frame,
        # exactly as the training loop does. (The original two-argument call
        # raised a TypeError.)
        state_inputs = state2StackedInputs(states_stack, state, np.zeros(state.shape, dtype=np.uint8))
        last_state = state

        img = plt.imshow(env.render(mode='rgb_array'))
        for j in range(1000):
            action = agent.act(state_inputs)
            img.set_data(env.render(mode='rgb_array'))
            plt.axis('off')
            display.display(plt.gcf())
            display.clear_output(wait=True)
            next_state, reward, done, _ = env.step(action)

            next_state_inputs = state2StackedInputs(states_stack, next_state, last_state)
            state_inputs = next_state_inputs
            last_state = next_state

            if done:
                break
finally:
    # Restore training mode and release the environment even when the cell is
    # interrupted part-way through an episode.
    agent.qnetwork_local.train()
    agent.qnetwork_target.train()

    env.close()

In [13]:
# Check whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

False