# Proposal (Draft)
I will use reinforcement learning to train an artificial intelligence agent to play the Atari 2600 game *Pitfall!*. Reinforcement learning is a domain of machine learning in which an agent takes actions based on observations of its environment in order to maximize its reward. The project falls under active reinforcement learning: a Q-learning agent learns an action-utility function (Q-function) that estimates the expected return of each action in each state, so it does not need to learn an explicit transition model between states. Learning the Q-function will let the agent choose the actions that maximize its score in *Pitfall!*.

The technology used will be OpenAI’s Gym Python package, which includes an Atari 2600 emulator that can run *Pitfall!*. The Gym version of the game makes it easy to interface with modern machine learning frameworks such as Keras, which I will use to train my artificial intelligence agent. The agent will be a deep neural network trained with reinforcement learning algorithms. The data used to train the agent will be an array representation of the current screen, in which each pixel has an RGB value, so the array has shape (210, 160, 3). After each action the environment also returns an integer reward; positive values act as positive reinforcement.

TODO: Make a note that this is for honors.
TODO: See the project guidelines. Need a rough draft of all the sections.
TODO: Put the versions of the packages.

The game can be set up and then run with a random agent, as shown below.
TODO: Try with 'PitfallDeterministic-v4'

In [None]:
import gym
# Smoke test: drive Pitfall with uniformly random actions for 5000 steps.
env = gym.make('Pitfall-v0')
try:
    env.reset()
    for _ in range(5000):
        env.render()
        # take a random action
        _obs, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            # Stepping an environment after the episode has ended is not
            # defined by Gym, so start a fresh episode before continuing.
            env.reset()
finally:
    # Always release the emulator/render window, even if the loop is
    # interrupted (e.g. by stopping the notebook cell).
    env.close()


In [3]:
import gym
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras import layers

In [None]:
# Play a single Breakout episode with uniformly random actions.
env = gym.make('BreakoutDeterministic-v4')
is_done = False
try:
    env.reset()
    while not is_done:
        env.render()
        # `frame` is the new observation and `reward` the step reward;
        # the fourth return value (info) is not used.
        frame, reward, is_done, _ = env.step(env.action_space.sample())
finally:
    # Close the environment even when the loop raises or the cell is
    # interrupted, so the render window is not left open.
    env.close()

In [2]:
# Preprocessing
def to_grayscale(img):
    """Average `img` over axis 2 and return the result as uint8.

    Axis 2 is presumably the colour-channel axis of an (H, W, C) frame.
    The cast to uint8 truncates the fractional part of the mean.
    """
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)

def downsample(img):
    """Return `img` with every second row and every second column kept.

    Only the first two axes are subsampled; any trailing axes are
    left untouched.
    """
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Shrink a frame with `downsample`, then convert it with `to_grayscale`.

    Downsampling happens first, so the grayscale conversion runs on the
    smaller array.
    """
    half_resolution = downsample(img)
    return to_grayscale(half_resolution)

def transform_reward(r):
    """Reduce a reward to its sign: -1, 0 or 1 (element-wise for arrays)."""
    reward_sign = np.sign(r)
    return reward_sign

In [None]:
def fit_batch(model, gamma, start_states, actions, rewards, next_states, is_terminal):
    """Do one deep Q learning iteration.

    Params:
    - model: The DQN. It is called with a two-element input list
      [states, action_mask] for both predict() and fit().
    - gamma: Discount factor (should be 0.99)
    - start_states: numpy array of starting states
    - actions: numpy array of one-hot encoded actions corresponding to the start states
    - rewards: numpy array of rewards corresponding to the start states and actions
    - next_states: numpy array of the resulting states corresponding to the start states and actions
    - is_terminal: numpy boolean array of whether the resulting state is terminal

    Returns nothing; the model is updated in place by model.fit().
    """
    # Predict the Q values of the next states. An all-ones mask is passed so
    # that the Q value of every action is returned.
    next_q_values = model.predict([next_states, np.ones(actions.shape)])
    # The Q values of the terminal states is 0 by definition, so override them
    # (the boolean array selects whole rows).
    next_q_values[is_terminal] = 0
    # The Q value of each start state is the reward + gamma * the max next state Q value
    q_values = rewards + gamma * np.max(next_q_values, axis=1)
    # Fit the model. The one-hot actions are passed as the mask and also
    # multiplied into the targets, so only the entry of the action actually
    # taken is non-zero in each target row.
    # `epochs` is the Keras 2 name of the argument Keras 1 called `nb_epoch`.
    model.fit(
        [start_states, actions], actions * q_values[:, None],
        epochs=1, batch_size=len(start_states), verbose=0
    )

In [None]:
def atari_model(n_actions):
    """Build, compile and return the masked DQN for stacked Atari frames.

    The network takes two inputs: a stack of frames (named 'frames') and an
    action mask (named 'mask'). Its output is the per-action value vector
    multiplied element-wise by the mask.

    Params:
    - n_actions: Number of valid actions; sets the width of both the mask
      input and the output layer.

    Returns the compiled keras Model (RMSprop optimizer, MSE loss). The model
    summary is printed as a side effect.
    """
    # Channels-first layout: 4 stacked frames of 105x80 pixels.
    # NOTE(review): 105x80 presumably matches a 210x160 screen after 2x
    # downsampling -- confirm against the preprocessing actually used.
    ATARI_SHAPE = (4, 105, 80)

    # With the functional API we need to define the inputs.
    frames_input = keras.layers.Input(ATARI_SHAPE, name='frames')
    actions_input = keras.layers.Input((n_actions,), name='mask')

    # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
    normalized = keras.layers.Lambda(lambda x: x / 255.0)(frames_input)

    # data_format is given explicitly so the layers read ATARI_SHAPE as
    # (channels, height, width) regardless of the backend's default.
    # "The first hidden layer convolves 16 8x8 filters with stride 4 with the input image and applies a rectifier nonlinearity."
    conv_1 = keras.layers.Conv2D(
        16, (8, 8), strides=(4, 4), activation='relu',
        data_format='channels_first'
    )(normalized)
    # "The second hidden layer convolves 32 4x4 filters with stride 2, again followed by a rectifier nonlinearity."
    conv_2 = keras.layers.Conv2D(
        32, (4, 4), strides=(2, 2), activation='relu',
        data_format='channels_first'
    )(conv_1)
    # Flattening the second convolutional layer.
    conv_flattened = keras.layers.Flatten()(conv_2)
    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    hidden = keras.layers.Dense(256, activation='relu')(conv_flattened)
    # "The output layer is a fully-connected linear layer with a single output for each valid action."
    output = keras.layers.Dense(n_actions)(hidden)
    # Finally, we multiply the output by the mask, so that only the masked-in
    # actions contribute to the output (and therefore to the loss).
    filtered_output = keras.layers.Multiply()([output, actions_input])

    model = keras.models.Model(
        inputs=[frames_input, actions_input], outputs=filtered_output
    )
    optimizer = keras.optimizers.RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    model.compile(optimizer, loss='mse')

    model.summary()

    return model