# SpaceInvaders-Atari2600

## Import the dependencies

In [1]:
import tensorflow as tf     # Deep Learning library
import numpy as np          # Linear algebra library
import retro                # Environment library

from skimage import transform        # Help us to preprocess the frames
from skimage.color import rgb2gray   # Help us to grayscale our frames

import matplotlib.pyplot as plt      # Display graphs
%matplotlib inline
from collections import deque        # Ordered collections with ends

import random                        # Random numbers

import warnings                      # This ignore all the warning messages that are normally printed
                                     # during the training because of skimage

warnings.filterwarnings('ignore')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Import functions from source

In [2]:
from src.frame_preparation import preprocess_frame, stack_frames
from src.exp_exp_tradeoff import predict_action
from src.environment import create_environment, game_commands
from src.change_action import action_to_command

## Set up our hyperparameters

In [3]:
# --- Frame preprocessing ---------------------------------------------------
stack_size = 4                    # How many consecutive frames form one state

# --- Network ---------------------------------------------------------------
# One state is a stack of `stack_size` frames of 110x84 pixels
# (presumably rows x columns x channels -- confirm against preprocess_frame).
state_size = [110, 84, stack_size]
action_size = 3                   # Number of actions the agent chooses between
learning_rate = 2.5e-4            # Alpha, the optimiser step size

# --- Training schedule -----------------------------------------------------
total_episodes = 20               # Episodes played during training
max_steps = 50000                 # Upper bound on steps inside one episode
batch_size = 3000                 # Batch size

# --- Epsilon-greedy exploration ---------------------------------------------
explore_start = 1                 # Exploration probability at the first step
explore_stop = 0.01               # Floor for the exploration probability
decay_rate = 1e-5                 # Exponential decay rate of the exploration probability

# --- Q-learning -------------------------------------------------------------
gamma_start = 1                   # Initial discounting rate
gamma_decay_rate = 2e-9           # Decay rate passed to predict_action together with gamma_start

# --- Replay memory ----------------------------------------------------------
pretrain_length = batch_size      # Experiences written into the memory before training starts
memory_size = 25600               # Capacity of the memory, in experiences

# --- Run switches -----------------------------------------------------------
# Set to False if you only want to watch the already trained agent.
training = True

# Set to True to render the environment while training.
episode_render = False

# Set to True to translate actions into commands before stepping the environment.
use_commands = True

## Create an environment

In [4]:
# Build the Space Invaders environment and the action helpers used by every later cell.
# The literal 3 is presumably the number of discrete actions (it equals action_size) --
# TODO confirm against src.environment.create_environment.
env, action_space, possible_actions, commands = create_environment(
    retro, 'SpaceInvaders-Atari2600', 3, use_commands, game_commands
)

The size of our frame is:  Box(210, 160, 3)

[[1 0 0]
 [0 1 0]
 [0 0 1]]


## Import our Deep Q-learning Neural Network model

This is our Deep Q-learning model:
* We take a stack of 4 frames as input
* It passes through 3 convolutional layers
* Then it is flattened
* Finally it passes through 2 FC layers
* It outputs a Q value for each action

In [5]:
from src.agent import DQNetwork

# Reset the graph
# (run before building the network so the model is created in a fresh default graph)
tf.reset_default_graph()

# Instantiate the DeepQNetwork
# NOTE: this rebinds the name `DQNetwork` from the imported class to the instance.
# Later cells read DQNetwork.loss, .optimizer, .output, .inputs_, .target_Q and
# .actions_ from this instance, so the name must stay as it is.
DQNetwork = DQNetwork(state_size, action_size, learning_rate)

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



## Create the experience replay

In [6]:
from src.experience_replay import Memory

# Frame stack shared by the pre-fill below and by the training / evaluation cells:
# `stack_size` blank 110x84 frames, with the oldest frame dropped when a new one is appended.
# The builtin `int` replaces the `np.int` alias, which NumPy deprecated in 1.20 and
# removed in 1.24; both name the same dtype.
stacked_frames = deque([np.zeros((110, 84), dtype=int) for _ in range(stack_size)],
                       maxlen=stack_size)

# Create the experience replay object
memory = Memory(memory_size)

# Instantiate memory with random tuples
# (8, -12, 4, -12) is the same tuple passed to every stack_frames call in this notebook --
# presumably crop margins; confirm in src.frame_preparation.
memory.instantiate_memory(env, possible_actions, (8, -12, 4, -12), action_size,
                          commands, stacked_frames, pretrain_length, use_commands)

## Set up TensorBoard

In [7]:
# Scalar summary op that records DQNetwork.loss under the tag "Loss".
loss = tf.summary.scalar("Loss", DQNetwork.loss)

# Single op that merges every summary registered in the default graph.
write_op = tf.summary.merge_all()

## Train our Agent

In [None]:
# Train the agent. For every episode: play until the game ends (or max_steps is hit)
# while storing each transition in `memory`, then run ONE optimisation step on a
# sample of those transitions, clear the memory and checkpoint the model.
saver = tf.train.Saver()
if training:
    # Tuple handed unchanged to every stack_frames call
    # (presumably crop margins used during preprocessing -- confirm in src.frame_preparation).
    frame_crop = (8, -12, 4, -12)

    with tf.Session() as sess:

        # Setup TensorBoard Writer
        writer = tf.summary.FileWriter("./graphs", sess.graph)

        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Step counter across all episodes; predict_action receives it together with decay_rate
        decay_step = 0

        rewards_list = []

        # Loss of the most recent optimisation step. It starts as NaN because the first
        # episode summary is printed before any optimisation step has run. It has its own
        # name so the `loss` summary op is not overwritten and then formatted as a number.
        training_loss = float('nan')

        for episode in range(total_episodes):
            # Set step to 0
            step = 0

            # Initialize the rewards of the episode
            episode_rewards = []

            # Make a new episode and observe the first state
            state = env.reset()

            # stack_frames also preprocesses the frame; True marks the first frame of an episode
            state, stacked_frames = stack_frames(stacked_frames, state, True, frame_crop)

            while step < max_steps:
                step += 1

                # Increase decay_step
                decay_step += 1

                # Predict the action to take; the call also returns the current
                # exploration probability and discount factor
                action, explore_probability, gamma = predict_action(sess, DQNetwork, explore_start, explore_stop, decay_rate, decay_step,
                                                                    state, possible_actions, gamma_start, gamma_decay_rate, step)

                # Translate the action into a command when the environment expects commands
                if use_commands:
                    command = action_to_command(action, action_size, commands)
                else:
                    command = action

                # Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = env.step(command)

                if episode_render:
                    env.render()

                # Add the reward to total reward
                episode_rewards.append(reward)

                # If the game is finished
                if done:
                    # The episode ends so no next state: stack a blank frame instead.
                    # Builtin `int` replaces the `np.int` alias removed in NumPy 1.24.
                    next_state = np.zeros((110, 84), dtype=int)

                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, frame_crop)

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {:.4f}'.format(training_loss),
                          'Gamma: {}'.format(gamma))

                    rewards_list.append((episode, total_reward))

                    # Store transition <st,at,rt+1,st+1> in memory D
                    memory.add((state, action, reward, next_state, done))

                else:
                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, frame_crop)

                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state

            # LEARNING PART
            # Obtain random batch from memory.
            # NOTE(review): `step` always equals max_steps once the loop above has exited,
            # so this requests max_steps (50000) samples although memory_size is 25600 and
            # batch_size is 3000 -- confirm how Memory.sample treats an oversized request.
            batch = memory.sample(step)

            states_mb = np.array([each[0] for each in batch], ndmin=3)
            actions_mb = np.array([each[1] for each in batch])
            rewards_mb = np.array([each[2] for each in batch])
            next_states_mb = np.array([each[3] for each in batch], ndmin=3)
            dones_mb = np.array([each[4] for each in batch])

            target_Qs_batch = []

            # Get Q values for next_state
            Qs_next_state = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs_: next_states_mb})

            # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a').
            # `gamma` is the value returned by the last predict_action call of the episode.
            for i in range(len(batch)):
                if dones_mb[i]:
                    # Terminal transition: the target is the reward alone
                    target_Qs_batch.append(rewards_mb[i])
                else:
                    target_Qs_batch.append(rewards_mb[i] + gamma * np.max(Qs_next_state[i]))

            targets_mb = np.array(target_Qs_batch)

            training_loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                        feed_dict={DQNetwork.inputs_: states_mb,
                                                   DQNetwork.target_Q: targets_mb,
                                                   DQNetwork.actions_: actions_mb})

            # Clear our memory
            memory.remove_all()

            # Write TF Summaries (disabled; running write_op would presumably need the
            # same feed_dict as the optimisation step above -- confirm before enabling)
            #summary = sess.run(write_op)
            #writer.add_summary(summary, episode)
            #writer.flush()

            # Save model
            save_path = saver.save(sess, "./space_models/model_" + str(episode) + "/model.ckpt")
            print("Model Saved")

        # Flush pending events and release the TensorBoard event file
        writer.close()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Model Saved


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Model Saved


## Test and Watch our Agent play

In [11]:
# Watch the trained agent: restore a checkpoint and play 5 episodes, always taking
# the action with the highest predicted Q value (no exploration).
saver = tf.train.Saver()
try:
    with tf.Session() as sess:
        total_test_rewards = []

        # Load the model
        saver.restore(sess, "./space_models/model_8/model.ckpt")

        for episode in range(5):
            total_rewards = 0

            state = env.reset()
            state, stacked_frames = stack_frames(stacked_frames, state, True, (8, -12, 4, -12))

            print("****************************************************")
            print("EPISODE ", episode)

            while True:
                # Add a leading batch axis of size 1 for the network input
                state = state.reshape((1, *state_size))

                # Estimate the Q values of the current state
                Qs = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs_: state})

                # Take the biggest Q value (= the best action)
                choice = np.argmax(Qs)
                action = possible_actions[choice]

                # Translate the action into a command only when commands are in use,
                # mirroring the training loop
                if use_commands:
                    command = action_to_command(action, action_size, commands)
                else:
                    command = action

                # Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = env.step(command)
                env.render()

                total_rewards += reward

                if done:
                    print ("Score", total_rewards)
                    total_test_rewards.append(total_rewards)
                    break

                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, (8, -12, 4, -12))
                state = next_state
finally:
    # Close the environment even when the run is interrupted (e.g. KeyboardInterrupt),
    # so the kernel is not left holding an open emulator.
    env.close()

INFO:tensorflow:Restoring parameters from ./space_models/model_8/model.ckpt


INFO - Restoring parameters from ./space_models/model_8/model.ckpt


****************************************************
EPISODE  0
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]


[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0

[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 1 0]
[0 0 0 0 0

KeyboardInterrupt: 