In [1]:
# import necessary modules from keras
from keras.layers import Dense,Softmax
from keras.layers import Conv2D,Flatten,MaxPooling2D
from keras.models import Sequential

# Policy network: a small convolutional stack followed by a dense head.
# Input is one 80x80 single-channel frame; output is a single sigmoid unit
# (the training loop treats it as the probability of the UP action).
model = Sequential()

# Only the FIRST layer of a Sequential model needs input_shape; later layers
# infer their input from the previous layer's output, so it is not repeated.
# Shapes below assume Conv2D's default 'valid' padding.
# conv 5x5: (80, 80, 1) -> (76, 76, 1); pool 2x2: -> (38, 38, 1)
model.add(Conv2D(filters=1,
                 kernel_size=5,
                 strides=1,
                 activation='relu',
                 input_shape=(80, 80, 1)))
model.add(MaxPooling2D(pool_size=2, strides=2))

# conv 5x5: (38, 38, 1) -> (34, 34, 1); pool 2x2: -> (17, 17, 1)
model.add(Conv2D(filters=1,
                 kernel_size=5,
                 strides=1,
                 activation='relu'))
model.add(MaxPooling2D(pool_size=2, strides=2))

# conv 5x5: (17, 17, 1) -> (13, 13, 1), then flattened to 169 features
model.add(Conv2D(filters=1,
                 kernel_size=5,
                 strides=1,
                 activation='relu'))
model.add(Flatten())

# fully connected hidden layer with 200 units
model.add(Dense(units=200, activation='relu', kernel_initializer='glorot_uniform'))

# output layer: one sigmoid unit
model.add(Dense(units=1, activation='sigmoid', kernel_initializer='RandomNormal'))

# compile the model using traditional Machine Learning losses and optimizers
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [2]:
import numpy as np
import gym
from gym import wrappers

# --- environment ---
# Pong environment wrapped in gym's Monitor, pointed at 'tmp/pong-base'.
env = wrappers.Monitor(gym.make("Pong-v0"), 'tmp/pong-base', force=True)
observation = env.reset()
prev_input = None  # no previous frame exists before the first step

# --- action codes handed to env.step ---
UP_ACTION = 2
DOWN_ACTION = 3
STOP = 0
# index -> action code lookup (not referenced by the cells visible here)
act_space = {0: UP_ACTION, 1: DOWN_ACTION, 2: STOP}

# --- hyperparameters ---
gamma = 0.99  # passed to discount_rewards when training

# --- per-episode buffers and counters used by the main loop ---
x_train = []
y_train = []
rewards = []
reward_sum = 0
episode_nb = 0

In [3]:
from karpathy import prepro, discount_rewards

# Main loop: play Pong indefinitely, sampling actions from the policy network
# and fitting it on each finished episode with discounted rewards as sample
# weights. Runs until interrupted; the environment is closed on the way out.
try:
    while True:
        env.render()

        # Preprocess the observation; the network input is the difference
        # between the current and previous preprocessed frames (zeros on the
        # first frame of an episode).
        # NOTE(review): prepro is assumed to return 80*80 values per frame,
        # matching the zero fallback below — confirm against karpathy.prepro.
        cur_input = prepro(observation)
        frame_diff = cur_input - prev_input if prev_input is not None else np.zeros(80 * 80)
        # Strict reshape to (height, width, channel): unlike np.resize this
        # raises instead of silently padding/truncating on a size mismatch.
        x = np.reshape(frame_diff, (80, 80, 1))
        prev_input = cur_input

        # Forward the policy network on a batch of one. The batch is built
        # with a new leading axis only: calling .T on the 4-D batch would
        # reverse every axis and feed the network a spatially transposed
        # frame that differs from the x stored for training.
        proba = model.predict(x[np.newaxis, ...])[0, 0]

        # Sample the action according to the predicted probability of UP.
        action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
        y = 1 if action == UP_ACTION else 0  # 0 and 1 are our labels

        # log the input and label to train later
        x_train.append(x)
        y_train.append(y)

        # do one step in our environment
        observation, reward, done, info = env.step(action)
        rewards.append(reward)
        reward_sum += reward

        # end of an episode
        if done:
            print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)

            # increment episode number
            episode_nb += 1

            # training: each (frame, sampled label) pair is weighted by the
            # value discount_rewards computes for that step
            model.fit(x=np.array(x_train), y=np.array(y_train), verbose=1,
                      sample_weight=discount_rewards(rewards, gamma))

            # Reinitialization
            x_train, y_train, rewards = [], [], []
            observation = env.reset()
            reward_sum = 0
            prev_input = None
finally:
    # The loop only ends via an exception (e.g. KeyboardInterrupt); close the
    # Monitor-wrapped environment so it can release whatever it holds.
    env.close()



At the end of episode 0 the total reward was : -21.0
At the end of episode 1 the total reward was : -20.0
At the end of episode 2 the total reward was : -21.0
At the end of episode 3 the total reward was : -21.0
At the end of episode 4 the total reward was : -21.0
At the end of episode 5 the total reward was : -20.0
At the end of episode 6 the total reward was : -21.0
At the end of episode 7 the total reward was : -21.0
At the end of episode 8 the total reward was : -21.0
At the end of episode 9 the total reward was : -21.0
At the end of episode 10 the total reward was : -21.0
At the end of episode 11 the total reward was : -21.0
At the end of episode 12 the total reward was : -21.0
At the end of episode 13 the total reward was : -21.0
At the end of episode 14 the total reward was : -21.0
At the end of episode 15 the total reward was : -21.0
At the end of episode 16 the total reward was : -21.0
At the end of episode 17 the total reward was : -21.0
At the end of episode 18 the total rew

KeyboardInterrupt: 