# In [6]:
# First, we import the modules we will need
import tensorflow as tf
import numpy as np
import gym
from skimage.transform import resize
from collections import deque
import random
############### GLOBAL VARIABLES ###############
# Width of the network's output layer and length of the one-hot action vectors.
N_ACTIONS = 4
# Step size handed to the Adam optimizer in trainModel.
LEARNING_RATE = 0.0001
# Number of transitions sampled from the replay memory per training step.
BATCHSIZE = 32
# Controls how many training episodes trainModel runs.
NEpisodes = 100
# Number of random-play transitions Dict() collects for the replay memory.
NDict = 50000
# Starting probability of taking a random action (epsilon-greedy).
INITIAL_EPSILON = 1.0
# Amount subtracted from epsilon after every environment step; epsilon is
# floored at 0.1, which is reached after (1.0 - 0.1) / 9e-7 = 1,000,000 steps.
deltaEps  = 9e-7
# Discount factor applied to the best next-state Q-value in the target.
gamma     = 0.99

# Checkpoint interval (presumably in episodes); no active code reads it.
SAVE_EVERY = 1
# Shared Gym environment used both for filling the replay memory and training.
env = gym.make("Breakout-v0")
#############################################

############### IMAGE PREPROCESSING FUNCTIONS ###############
# Now we create functions to do some fun and intriguing image preprocessing.
# The size we will use will preserve the original aspect ratio and one of the sides will be a power of 2.
# New image size should be: 84x64.
# The new size will make our network train faster.

# function to convert image to grayscale
def to_grayscale(img):
    """Collapse the channel axis of an image into a single grey channel.

    Each output pixel is the arithmetic mean over axis 2 of ``img``, cast to
    ``np.uint8`` (the cast drops any fractional part).
    """
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)

# function to downsample the image by a factor of 2.5.
def downsample(img):
    """Shrink an image by a factor of 2.5 along its first two axes.

    A 210x160 frame becomes 84x64. The original value range of ``img`` is
    kept (``preserve_range=True``): without it ``resize`` rescales integer
    images to floats in [0, 1], and the later cast to ``uint8`` in
    ``to_grayscale`` would turn almost every pixel into 0.

    Args:
        img: array-like image whose first two axes are the ones to shrink.

    Returns:
        The resized image as a float array in the same value range as ``img``.
    """
    # The output shape must be integral; round so that e.g. 84.0 -> 84.
    new_shape = (int(round(img.shape[0] / 2.5)), int(round(img.shape[1] / 2.5)))
    return resize(img, new_shape, anti_aliasing=True, preserve_range=True)

# function to implement all the preprocessing
def preprocess(img):
    """Run the complete preprocessing chain on one raw frame.

    The frame is first shrunk with ``downsample`` and the result is then
    reduced to a single channel with ``to_grayscale``.
    """
    shrunk = downsample(img)
    grey = to_grayscale(shrunk)
    return grey

#############################################

############### MODEL ###############

# function conv2d lets us make our model less verbose
def conv2d(x, W, stride):
    """Convolve ``x`` with the filter bank ``W`` using "SAME" padding.

    ``stride`` is used for both middle entries of the strides vector; the
    first and last entries are fixed to 1.
    """
    step = [1, stride, stride, 1]
    return tf.nn.conv2d(x, W, strides=step, padding="SAME")

# function maxPool lets us make our model less verbose
def maxPool(x):
    """Max-pool ``x`` with a 2x2 window, stride 2 and "SAME" padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding="SAME")

# function wVar allows us to make our model less verbose
def wVar(shp):
    """Create a weight variable of shape ``shp``.

    The initial value is drawn from a truncated normal distribution with a
    standard deviation of 0.01.
    """
    return tf.Variable(tf.truncated_normal(shp, stddev=0.01))

# function bVar allows us to make our model less verbose
def bVar(shp):
    """Create a bias variable of shape ``shp`` with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shp))

# function with the convolutional network Low-End tensorflow
def ConvolutionalNeuralNetwork(pool_layer = False):
    """Assemble the convolutional Q-network graph.

    The network has three convolutional layers (8x8 stride 4, 4x4 stride 2,
    3x3 stride 1), one fully connected hidden layer and a linear output
    layer with N_ACTIONS units. Max-pooling always follows the first
    convolution.

    Args:
        pool_layer: when it compares equal to False, no further pooling is
            applied; the flattened features (1536 values) feed a 512-unit
            hidden layer. Otherwise max-pooling also follows the second and
            third convolutions; the flattened features (128 values) feed a
            128-unit hidden layer.

    Returns:
        A tuple ``(inpu_, output_, w_and_b)``: the float32 input placeholder
        of shape [None, 84, 64, 4], the output tensor, and a dict whose
        "weights" and "biases" entries are ``np.array`` wrappers around the
        five weight and five bias variables.
    """
    # Deliberately an equality test rather than ``not pool_layer`` so that
    # every argument value selects the same branch it always did.
    single_pool = (pool_layer == False)
    if single_pool:
        flat_size, hidden_size = 1536, 512
    else:
        flat_size, hidden_size = 128, 128

    def conv_relu(tensor, kernel, bias, stride):
        # Convolution + bias followed by a ReLU non-linearity.
        return tf.nn.relu(conv2d(tensor, kernel, stride) + bias)

    # Input: a stack of 4 preprocessed 84x64 frames per sample.
    inpu_ = tf.placeholder(tf.float32, [None, 84, 64, 4])

    # Parameters, created in a fixed order: conv layers first, then the
    # two fully connected layers.
    WConv1, bConv1 = wVar([8, 8, 4, 32]), bVar([32])
    WConv2, bConv2 = wVar([4, 4, 32, 64]), bVar([64])
    WConv3, bConv3 = wVar([3, 3, 64, 64]), bVar([64])
    WFullCon1, bFullCon1 = wVar([flat_size, hidden_size]), bVar([hidden_size])
    # Change N_ACTIONS to match the number of actions of the game.
    WFullCon2, bFullCon2 = wVar([hidden_size, N_ACTIONS]), bVar([N_ACTIONS])

    # Hidden layers.
    features = maxPool(conv_relu(inpu_, WConv1, bConv1, 4))
    features = conv_relu(features, WConv2, bConv2, 2)
    if not single_pool:
        features = maxPool(features)
    features = conv_relu(features, WConv3, bConv3, 1)
    if not single_pool:
        features = maxPool(features)
    hFlat = tf.reshape(features, [-1, flat_size])
    hFullCon1 = tf.nn.relu(tf.matmul(hFlat, WFullCon1) + bFullCon1)

    # Output layer: one linear unit per action.
    output_ = tf.matmul(hFullCon1, WFullCon2) + bFullCon2

    all_weights = [WConv1, WConv2, WConv3, WFullCon1, WFullCon2]
    all_biases = [bConv1, bConv2, bConv3, bFullCon1, bFullCon2]
    w_and_b = {"weights": np.array(all_weights), "biases": np.array(all_biases)}
    return inpu_, output_, w_and_b

#############################################
# Initialize the replay memory D.
# D contains NDict entries; each entry is the tuple
# (state at t, one-hot action at t, reward at t, state at t+1, done flag),
# where a state is a stack of 4 preprocessed frames.
def Dict():
    """Fill the initial replay memory with transitions from random play.

    Random actions are sampled from ``env.action_space`` until NDict
    transitions have been collected; the environment is reset whenever an
    episode ends. The first state of every episode is the reset frame
    repeated 4 times.

    Every stored state is an immutable tuple of 4 preprocessed frames, so an
    entry can no longer change after it has been appended (previously all
    entries of an episode shared one mutable deque, which made the state and
    the next state of every transition identical). Consecutive states share
    their frame arrays, so each step stores only one new frame.

    Returns:
        A deque of NDict tuples ``(state, a_t, reward, next_state, done)``
        where ``a_t`` is a one-hot vector of length N_ACTIONS.
    """
    D = deque()
    done = True  # forces a reset on the first iteration
    frames = None
    state = None
    while len(D) < NDict:
        if done:
            # Assumes the classic Gym API in which reset() returns the first
            # observation (consistent with the 4-tuple returned by step()).
            first_frame = preprocess(env.reset())
            frames = deque([first_frame] * 4, maxlen=4)
            state = tuple(frames)

        action = env.action_space.sample()
        a_t = np.zeros([N_ACTIONS])
        a_t[action] = 1

        new_frame, reward, done, _ = env.step(action)
        # maxlen=4 drops the oldest frame automatically.
        frames.append(preprocess(new_frame))
        next_state = tuple(frames)

        D.append((state, a_t, reward, next_state, done))
        state = next_state
    return D

# Training: epsilon-greedy Q-learning with experience replay.
def _states_to_batch(states):
    """Turn a sequence of frame stacks into a network input batch.

    Each state is a sequence of 4 frames of shape (84, 64). The frame axis
    is moved to the end, giving an array of shape (len(states), 84, 64, 4).
    A plain reshape would keep the element order of the (4, 84, 64) layout
    and therefore scramble pixels across the channels.
    """
    stacked = np.asarray([list(state) for state in states])
    return np.moveaxis(stacked, 1, -1)


def trainModel(inpu, output, weights_and_biases, session):
    """Train the Q-network with epsilon-greedy play and experience replay.

    Args:
        inpu: input placeholder of the network, shape [None, 84, 64, 4].
        output: network output tensor with one Q-value per action.
        weights_and_biases: network parameters; accepted for interface
            compatibility but not used here.
        session: TensorFlow session used for every graph evaluation.

    The replay memory is created with ``Dict()``. After every environment
    step the new transition replaces the oldest one, a random batch of
    BATCHSIZE transitions is sampled and one Adam step is taken on the mean
    squared error between the predicted Q-value of the taken action and the
    target ``reward + gamma * max_a Q(next_state, a)`` (just ``reward`` for
    terminal transitions). A summary line is printed after each episode.
    """
    a = tf.placeholder(tf.float32, [None, N_ACTIONS])
    y = tf.placeholder(tf.float32, [None])

    # Q-value of the action actually taken: the one-hot mask ``a`` selects
    # one entry per row. (A matrix product with a^T would mix the samples
    # of the batch instead.)
    output_mod = tf.reduce_sum(tf.multiply(output, a), axis=1)
    # Loss: mean squared error between target and predicted Q-value.
    loss = tf.reduce_mean(tf.square(y - output_mod))
    adam_optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    update_model = adam_optimizer.minimize(loss)

    D = Dict()

    # NOTE: checkpointing is not wired up; SAVE_EVERY is currently unused.
    session.run(tf.global_variables_initializer())

    epsilon = INITIAL_EPSILON

    # Episodes are numbered 1..NEpisodes inclusive.
    for i in range(1, NEpisodes + 1):
        # The episode starts with action 0; the resulting frame, repeated
        # 4 times, is the initial state.
        env.reset()
        new_frame, reward, done, _ = env.step(0)
        first_frame = preprocess(new_frame)
        frames = deque([first_frame] * 4, maxlen=4)
        # States are immutable snapshots so that entries stored in D do not
        # change when the frame window advances.
        state = tuple(frames)

        l = 0
        rewards = 0
        while not done:
            # Epsilon-greedy action selection.
            a_t = np.zeros([N_ACTIONS])
            if np.random.uniform(0.0, 1.0) < epsilon:
                action = env.action_space.sample()
            else:
                q_values = session.run(
                    output, feed_dict={inpu: _states_to_batch([state])})
                action = int(np.argmax(q_values))
            a_t[action] = 1

            new_frame, reward, done, _ = env.step(action)
            frames.append(preprocess(new_frame))  # oldest frame is dropped
            next_state = tuple(frames)

            # Replace the oldest transition so that D keeps its size.
            D.append((state, a_t, reward, next_state, done))
            D.popleft()

            # Epsilon decreases linearly by deltaEps per step down to 0.1.
            epsilon = max(epsilon - deltaEps, 0.1)

            # Sample a random batch and build the training targets.
            batch = random.sample(D, BATCHSIZE)
            state_batch = [t[0] for t in batch]
            action_batch = [t[1] for t in batch]
            reward_batch = np.array([t[2] for t in batch], dtype=np.float64)
            next_state_batch = [t[3] for t in batch]
            terminal = np.array([bool(t[4]) for t in batch])

            # One forward pass for the whole batch instead of one per sample.
            next_q = session.run(
                output, feed_dict={inpu: _states_to_batch(next_state_batch)})
            Yj = np.where(terminal,
                          reward_batch,
                          reward_batch + gamma * next_q.max(axis=1))

            # One optimisation step; run explicitly on the given session
            # rather than relying on a default session being installed.
            session.run(update_model,
                        feed_dict={y: Yj,
                                   a: action_batch,
                                   inpu: _states_to_batch(state_batch)})

            state = next_state
            rewards = rewards + reward
            l += 1
        print("Episode: ", i, "  Total Steps:", l, "Total Reward:", rewards)
############################################
# Entry points: build the network and run the training loop.
def simulate_game():
    """Build the Q-network (with the extra pooling layers) and train it.

    An interactive TensorFlow session is opened for the duration of the
    training run and is always closed afterwards, even when training raises.
    """
    session = tf.InteractiveSession()
    try:
        inpu, output, weights_and_biases = ConvolutionalNeuralNetwork(pool_layer = True)
        trainModel(inpu, output, weights_and_biases, session)
    finally:
        # Release the resources held by the session.
        session.close()

def main():
    """Script entry point: delegates to simulate_game()."""
    simulate_game()

# Start training only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

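# Captured notebook output (not part of the program):
#   warn("The default mode, 'constant', will be changed to 'reflect' in "
#
#   NameError: name 'hConv1' is not defined
# NOTE(review): hConv1 is assigned on both branches of the network builder
# above, so this NameError appears to come from an earlier run - confirm.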