In [None]:
import gym
import tensorflow as tf
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from IPython.display import clear_output
from gym import wrappers
import random

# Seed Python's `random` module. Called without an argument, the seed is taken
# from a system source (OS entropy or the clock), so runs are NOT repeatable.
# NOTE(review): numpy and TensorFlow are not seeded anywhere in this cell.
random.seed()
# IPython line magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# base code originally from udacity-deep-learning/reinforcement/Q-learning-cart.ipynb

# Restricts to running on a single GPU, in this case the second GPU ("1")
import os
# Enumerate CUDA devices in PCI bus order so that the index below is stable.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
# Expose only device "1" to CUDA-based libraries loaded in this process.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Create class QNetwork
class QNetwork:
    """Q-value network plus a next-state prediction network (TF1 graph mode).

    Both networks are built inside ``tf.variable_scope(name)`` and share the
    ``inputs_`` and ``actions_`` placeholders:

    * Q network: ``inputs_`` -> two hidden layers -> ``output`` with one value
      per action. ``Q`` is the value of the action given in ``actions_`` and
      ``loss`` is the mean squared error against ``targetQs_``.
    * Prediction network: ``[inputs_, one_hot(actions_)]`` -> two hidden
      layers -> ``prediction_state``. ``prediction_loss`` is the mean squared
      error against ``next_state_``.

    Args:
        learning_rate: Learning rate used by both Adam optimizers.
        state_size: Width of a state vector.
        action_size: Number of discrete actions.
        hidden_size: Units in every hidden layer.
        hidden_layers: Accepted for interface compatibility; the constructor
            does not read it and always builds two hidden layers per network.
        alpha: Slope applied to negative pre-activations, implemented as
            ``max(alpha * x, x)``; ``0.`` is a plain ReLU.
        name: Variable scope under which all ops are created.
    """

    def __init__(self,
                 learning_rate=0.01,
                 state_size=4,
                 action_size=2,
                 hidden_size=10,
                 hidden_layers=2,
                 alpha=0.,
                 name='QNetwork'):
        with tf.variable_scope(name):
            # --- Placeholders -------------------------------------------------
            # Batch of input states.
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')

            # Batch of integer action ids, one-hot encoded below.
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')

            # Batch of observed next states (target of the prediction network).
            self.next_state_ = tf.placeholder(tf.float32, [None, state_size], name='next_state')

            one_hot_actions = tf.one_hot(self.actions_, action_size)

            # Batch of scalar Q targets.
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

            # --- Hidden layers ------------------------------------------------
            # Layer creation order is kept fixed on purpose: tf.layers.dense
            # auto-numbers its variables in the order the layers are built.
            self.fc1 = self._leaky_dense(self.inputs_, hidden_size, alpha)
            self.fc2 = self._leaky_dense(self.fc1, hidden_size, alpha)
            q_features = self.fc2

            # The prediction network sees the state and the chosen action.
            self.pred_input = tf.concat([self.inputs_, one_hot_actions], 1)
            self.pred_fc1 = self._leaky_dense(self.pred_input, hidden_size, alpha)
            self.pred_fc2 = self._leaky_dense(self.pred_fc1, hidden_size, alpha)

            # --- Linear heads -------------------------------------------------
            self.output = tf.layers.dense(
                q_features,
                action_size,
                activation=None,
                kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.prediction_state = tf.layers.dense(
                self.pred_fc2,
                state_size,
                activation=None,
                kernel_initializer=tf.contrib.layers.xavier_initializer())

            # --- Q network loss: (targetQ - Q)^2 ------------------------------
            # Masking with the one-hot actions keeps only the taken action's Q.
            self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

            # --- Prediction network loss --------------------------------------
            self.prediction_loss = tf.reduce_mean(
                tf.square(self.next_state_ - self.prediction_state))
            self.pred_opt = tf.train.AdamOptimizer(learning_rate).minimize(self.prediction_loss)

    @staticmethod
    def _leaky_dense(inputs, units, alpha):
        """Xavier-initialised dense layer followed by ``max(alpha * x, x)``."""
        pre_activation = tf.layers.dense(
            inputs,
            units,
            activation=None,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        return tf.maximum(alpha * pre_activation, pre_activation)


            
# create memory class for storing previous experiences
class Memory():
    """Bounded FIFO replay buffer.

    Experiences are kept in a ``deque`` with ``maxlen=max_size``, so once the
    buffer is full every new entry evicts the oldest one.
    """

    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences drawn uniformly at random.

        Sampling is without replacement, so ``np.random.choice`` raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored items.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)
        return [self.buffer[pos] for pos in chosen]

def running_mean(x, N):
    """Return the means of every length-``N`` sliding window over ``x``.

    Uses a cumulative sum with a leading zero so each window total is the
    difference of two prefix sums. The result has ``len(x) - N + 1`` entries
    (empty when ``x`` is shorter than ``N``).
    """
    prefix_sums = np.cumsum(np.insert(x, 0, 0))
    window_totals = prefix_sums[N:] - prefix_sums[:-N]
    return window_totals / N

def normalize_state(x, denormalize=False):
    """Scale a state by fixed per-component constants.

    Args:
        x: Array whose last axis has four components (it is broadcast against
            a four-element list of scale factors).
        denormalize: When False (default) divide by the scale factors; when
            True multiply by them, undoing a previous normalisation.

    Returns:
        The element-wise scaled array.
    """
    # Per-component scale factors for the four state components.
    scale = [2.,3.,0.3,2.]
    return x * scale if denormalize else x / scale

def predict_next_state(mainQN,state,sess):
    """Predict the state that follows ``state`` under the greedy action.

    Args:
        mainQN: Network object exposing ``inputs_``, ``actions_``, ``output``
            and ``prediction_state``.
        state: Value fed to ``mainQN.inputs_``. The argmax below is taken over
            the flattened Q output, so a single-row batch is expected.
        sess: Session-like object with ``run(fetch, feed_dict=...)``.

    Returns:
        Whatever ``sess.run`` yields for ``mainQN.prediction_state``.
    """
    # First pass: Q-values for the state, from which the greedy action is taken.
    q_values = sess.run(mainQN.output, feed_dict={mainQN.inputs_: state})
    greedy_action = np.argmax(q_values)

    # Second pass: feed the state and that action to the prediction head.
    prediction_feed = {
        mainQN.inputs_: state,
        mainQN.actions_: np.expand_dims(greedy_action, axis=0),
    }
    return sess.run(mainQN.prediction_state, feed_dict=prediction_feed)

def initialize_memory_rand_states(memory_size=1000,pretrain_length=32):
    """Fill a fresh replay buffer with transitions from random actions.

    Uses the module-level ``env``. Random actions are taken until at least
    ``pretrain_length`` transitions are stored AND the current episode has
    ended, so the environment is left freshly reset on return.

    Every state stored in the buffer is normalised with ``normalize_state``,
    matching what ``train_q_network`` stores. (Previously ``next_state`` was
    stored raw, so the buffer mixed normalised and unnormalised states.)
    Terminal transitions store an all-zero ``next_state`` as the end marker.

    Args:
        memory_size: Capacity of the returned ``Memory``.
        pretrain_length: Minimum number of transitions to collect.

    Returns:
        The populated ``Memory`` of ``(state, action, reward, next_state)``.
    """
    # Initialize the simulation with one random action to get a first state.
    env.reset()
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    if done:
        # The very first step ended the episode: start over rather than keep
        # stepping a finished environment. `done` is left untouched so the
        # loop condition below behaves as before.
        state = env.reset()
    state = normalize_state(state)

    memory = Memory(max_size=memory_size)

    # Make a bunch of random actions and store the experiences
    ii = 0
    while ii < pretrain_length or not done:

        # Make a random action
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        # The sampled action is indexable; only its first element (the control
        # action) is stored, which is what the Q network is trained on.
        action = action[0]
        if done:
            # The simulation fails so no next state
            next_state = np.zeros(state.shape)

            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start new episode
            state = normalize_state(env.reset())

        else:
            # Keep stored states on the same scale as the training loop uses.
            next_state = normalize_state(next_state)

            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state

        ii = ii + 1

    return memory

def train_q_network(mainQN,\
                    memory,\
                    train_episodes=1500,\
                    gamma=0.99,\
                    explore_start=1.0,\
                    explore_stop=0.01,\
                    decay_rate=0.0001,\
                    batch_size=32,\
                    max_steps=500,\
                    verbose=True):
    """Train the Q network and the next-state predictor with experience replay.

    Uses the module-level ``env``. At every step an action is chosen
    epsilon-greedily, the prediction network is rolled forward five steps from
    the current state, and ``env.step`` is called with a list holding the
    action followed by those five denormalised predicted states. The resulting
    transition is stored in ``memory`` and one mini-batch update is run for
    each of the two networks.

    Args:
        mainQN: ``QNetwork`` instance built in the current default graph.
        memory: Replay buffer; must already hold at least ``batch_size`` items.
        train_episodes: Number of episodes to run.
        gamma: Discount factor for the Q targets.
        explore_start: Exploration probability at step 0.
        explore_stop: Asymptotic minimum exploration probability.
        decay_rate: Exponential decay rate of the exploration probability,
            applied per environment step.
        batch_size: Mini-batch size sampled from ``memory``.
        max_steps: Maximum number of steps per episode.
        verbose: Print a progress line after every episode.

    Returns:
        ``(rewards_list, mainQN, saver, runningMean)`` where ``rewards_list``
        holds ``(episode, total_reward)`` pairs and ``runningMean`` is the mean
        total reward of the last (up to) 100 episodes that ended with ``done``
        (``nan`` if none did).
    """
    checkpoint_path = "checkpoints/cartpole.ckpt"
    # Number of steps the prediction network is rolled forward for env.step.
    rollout_length = 5

    state = env.reset()
    state = normalize_state(state)

    # Now train with experiences
    saver = tf.train.Saver()
    rewards_list = []

    # Defaults so the report line and the return value are always defined,
    # even when train_episodes or max_steps is 0.
    runningMean = float('nan')
    loss = float('nan')
    pred_loss = float('nan')
    explore_p = explore_start

    with tf.Session() as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())

        step = 0
        # Total rewards of episodes that ended with `done`.
        steps_list = []

        for ep in range(train_episodes):
            total_reward = 0
            t = 0

            while t < max_steps:
                step += 1

                # Explore or Exploit
                explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step)
                if explore_p > np.random.rand():
                    # Make a random action
                    action = random.randint(0,1)
                else:
                    # Get action from Q-network
                    feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                    Qs = sess.run(mainQN.output, feed_dict=feed)
                    action = np.argmax(Qs)

                # The env is stepped with [action, predicted state 1, ..., 5].
                pred_action = [action]

                # Roll the prediction network forward from the current state;
                # each rollout step follows the greedy action at that state.
                next_state = state.reshape((1, *state.shape))
                for _ in range(rollout_length):
                    next_state = predict_next_state(mainQN,next_state,sess)
                    pred_action.append(normalize_state(next_state,denormalize=True))

                # Take action, get new state and reward
                next_state, reward, done, _ = env.step(pred_action)
                next_state = normalize_state(next_state)

                total_reward += reward
                if done:
                    # the episode ends so no next state; an all-zero state is
                    # the terminal marker detected when building targets below
                    next_state = np.zeros(state.shape)
                    steps_list.append(total_reward)
                    # Leave the step loop after this iteration's training update.
                    t = max_steps

                    # Add experience to memory
                    memory.add((state, action, reward, next_state))
                    state = env.reset()
                    state = normalize_state(state)
                else:
                    # Add experience to memory
                    memory.add((state, action, reward, next_state))
                    state = next_state
                    t += 1

                # Sample mini-batch from memory
                batch = memory.sample(batch_size)
                states = np.array([each[0] for each in batch])
                actions = np.array([each[1] for each in batch])
                rewards = np.array([each[2] for each in batch])
                next_states = np.array([each[3] for each in batch])

                # Train network
                target_Qs = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})

                # Set target_Qs to 0 for states where episode ends
                episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
                target_Qs[episode_ends] = 0.0

                targets = rewards + gamma * np.max(target_Qs, axis=1)

                loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                    feed_dict={mainQN.inputs_: states,
                                               mainQN.targetQs_: targets,
                                               mainQN.actions_: actions})

                pred_loss, _ = sess.run([mainQN.prediction_loss, mainQN.pred_opt],
                                       feed_dict={mainQN.inputs_: states,
                                                mainQN.actions_: actions,
                                                mainQN.next_state_: next_states})

            rewards_list.append((ep, total_reward))
            # No episode may have finished yet (max_steps hit without `done`);
            # report nan in that case without triggering numpy's empty-mean warning.
            recent_totals = steps_list[-100:]
            runningMean = np.mean(recent_totals) if recent_totals else float('nan')
            if verbose:

                print('Episode: {}'.format(ep),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Prediction loss: {:.4f}'.format(pred_loss),
                      'Explore P: {:.4f}'.format(explore_p),
                      'RunMean : {:.4f}'.format(runningMean))

        # Saver.save does not create missing parent directories, so make sure
        # the checkpoint directory exists before writing.
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        saver.save(sess, checkpoint_path)
        return rewards_list, mainQN, saver, runningMean

def plot_rewards(rewards_list):
    """Plot per-episode total reward and its 10-episode running mean.

    Args:
        rewards_list: Sequence of ``(episode, total_reward)`` pairs. Nothing is
            drawn when it is empty.

    The raw rewards are drawn in translucent grey. The smoothed curve is
    aligned with the last episode of each window and is skipped when there
    are fewer episodes than the window length (the running mean is then
    empty, and slicing with ``-0`` would select every episode and make
    ``plt.plot`` fail on mismatched lengths).
    """
    if len(rewards_list) == 0:
        return

    eps, rews = np.array(rewards_list).T
    smoothed_rews = running_mean(rews, 10)
    if len(smoothed_rews) > 0:
        plt.plot(eps[-len(smoothed_rews):], smoothed_rews)
    plt.plot(eps, rews, color='grey', alpha=0.3)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')


def generate_and_train_qnetwork(train_episodes=500,\
                   gamma=0.99,\
                   explore_start=1.0,\
                   explore_stop=0.01,\
                   decay_rate=0.0001,\
                   hidden_size=128,\
                   hidden_layers=2,\
                   learning_rate=0.0001,\
                   memory_size=10000,\
                   batch_size=32,\
                   alpha=0.1,\
                   verbose=True):
    """Build a fresh QNetwork, pre-fill the replay memory and train it.

    Resets the default TensorFlow graph, constructs the network, seeds the
    replay buffer with ``batch_size`` random-action transitions and runs
    ``train_q_network``.

    Args:
        train_episodes, gamma, explore_start, explore_stop, decay_rate,
        batch_size: Forwarded to ``train_q_network``.
        hidden_size, hidden_layers, learning_rate, alpha: Forwarded to
            ``QNetwork``.
        memory_size: Capacity of the replay buffer.
        verbose: Print progress, plot the reward curve and print the average.

    Returns:
        ``(avg_train_rewards, mainQN, saver, number_of_episodes)``.
    """
    # reset graph
    tf.reset_default_graph()

    mainQN = QNetwork(name='main', hidden_size=hidden_size, \
                      hidden_layers=hidden_layers, learning_rate=learning_rate, alpha=alpha)

    memory = initialize_memory_rand_states(memory_size=memory_size,pretrain_length=batch_size)

    # train q-network
    # train_q_network returns four values; the trailing running mean is not
    # needed here. (Unpacking only three raised "too many values to unpack".)
    rewards_list, mainQN, saver, _final_run_mean = train_q_network(mainQN,\
                                  memory,\
                                  train_episodes = train_episodes, \
                                  gamma=gamma,\
                                  explore_start=explore_start,\
                                  explore_stop=explore_stop,\
                                  decay_rate=decay_rate,\
                                  batch_size=batch_size,\
                                  verbose=verbose)

    if verbose:
        # plot training
        plot_rewards(rewards_list)

    # Mean of the per-episode total rewards over the whole training run.
    avg_train_rewards = np.sum([each[1] for each in rewards_list]) / len(rewards_list)
    if verbose:
        print('average training reward = ',avg_train_rewards)

    return avg_train_rewards, mainQN, saver, len(rewards_list)


# Create the environment
env = gym.make('PredictObsCartpole-v0')

# Start monitor; force=True lets it reuse a results directory that already
# holds files from an earlier run.
env = wrappers.Monitor(env, '/tmp/PredictObsCartpole-experiment-1',force=True)

try:
    # Train network
    avg_train_rewards, mainQN, saver, number_of_episodes = generate_and_train_qnetwork(train_episodes=500, verbose=True)
    # The value reported here is the average over the training episodes.
    print('average training reward = ', avg_train_rewards,'   number of trials = ',number_of_episodes)
finally:
    # Close environment even if training raised, so it is never left open.
    env.close()


[2017-05-27 18:52:33,632] Making new env: PredictObsCartpole-v0
[2017-05-27 18:52:34,136] Disabling video recorder because <TimeLimit<PredictObsCartpoleEnv<PredictObsCartpole-v0>>> neither supports video mode "rgb_array" nor "ansi".
[2017-05-27 18:52:34,137] Disabling video recorder because <TimeLimit<PredictObsCartpoleEnv<PredictObsCartpole-v0>>> neither supports video mode "rgb_array" nor "ansi".


Episode: 0 Total reward: 54.0 Training loss: 1.1507 Prediction loss: 0.2472 Explore P: 0.9947 RunMean : 54.0000
Episode: 1 Total reward: 29.0 Training loss: 1.1670 Prediction loss: 0.1079 Explore P: 0.9918 RunMean : 41.5000
Episode: 2 Total reward: 32.0 Training loss: 1.2318 Prediction loss: 0.1519 Explore P: 0.9887 RunMean : 38.3333


[2017-05-27 18:52:37,075] Disabling video recorder because <TimeLimit<PredictObsCartpoleEnv<PredictObsCartpole-v0>>> neither supports video mode "rgb_array" nor "ansi".


Episode: 3 Total reward: 11.0 Training loss: 1.2236 Prediction loss: 0.0682 Explore P: 0.9876 RunMean : 31.5000
Episode: 4 Total reward: 14.0 Training loss: 1.2184 Prediction loss: 0.0380 Explore P: 0.9862 RunMean : 28.0000
Episode: 5 Total reward: 60.0 Training loss: 1.3775 Prediction loss: 0.0557 Explore P: 0.9804 RunMean : 33.3333
Episode: 6 Total reward: 11.0 Training loss: 1.2719 Prediction loss: 0.0380 Explore P: 0.9793 RunMean : 30.1429
Episode: 7 Total reward: 13.0 Training loss: 1.2858 Prediction loss: 0.0159 Explore P: 0.9781 RunMean : 28.0000
Episode: 8 Total reward: 31.0 Training loss: 1.3267 Prediction loss: 0.0169 Explore P: 0.9751 RunMean : 28.3333
Episode: 9 Total reward: 20.0 Training loss: 1.3901 Prediction loss: 0.0171 Explore P: 0.9731 RunMean : 27.5000
Episode: 10 Total reward: 46.0 Training loss: 1.8072 Prediction loss: 0.0127 Explore P: 0.9687 RunMean : 29.1818
Episode: 11 Total reward: 9.0 Training loss: 3.4569 Prediction loss: 0.0660 Explore P: 0.9679 RunMean :

[2017-05-27 18:52:45,665] Disabling video recorder because <TimeLimit<PredictObsCartpoleEnv<PredictObsCartpole-v0>>> neither supports video mode "rgb_array" nor "ansi".


Episode: 22 Total reward: 50.0 Training loss: 4.4852 Prediction loss: 0.0047 Explore P: 0.9418 RunMean : 26.3478
Episode: 23 Total reward: 12.0 Training loss: 4.9922 Prediction loss: 0.0084 Explore P: 0.9407 RunMean : 25.7500
Episode: 24 Total reward: 30.0 Training loss: 3.7123 Prediction loss: 0.0037 Explore P: 0.9379 RunMean : 25.9200
Episode: 25 Total reward: 16.0 Training loss: 6.4470 Prediction loss: 0.0100 Explore P: 0.9364 RunMean : 25.5385
Episode: 26 Total reward: 19.0 Training loss: 4.9838 Prediction loss: 0.0051 Explore P: 0.9346 RunMean : 25.2963
Episode: 27 Total reward: 31.0 Training loss: 20.6207 Prediction loss: 0.0143 Explore P: 0.9318 RunMean : 25.5000
Episode: 28 Total reward: 26.0 Training loss: 21.5904 Prediction loss: 0.0062 Explore P: 0.9294 RunMean : 25.5172
Episode: 29 Total reward: 24.0 Training loss: 12.9820 Prediction loss: 0.0186 Explore P: 0.9272 RunMean : 25.4667
Episode: 30 Total reward: 18.0 Training loss: 37.1349 Prediction loss: 0.0175 Explore P: 0.92

[2017-05-27 18:52:59,818] Disabling video recorder because <TimeLimit<PredictObsCartpoleEnv<PredictObsCartpole-v0>>> neither supports video mode "rgb_array" nor "ansi".


Episode: 59 Total reward: 15.0 Training loss: 138.2730 Prediction loss: 0.0118 Explore P: 0.8758 RunMean : 22.3500
Episode: 60 Total reward: 25.0 Training loss: 27.7780 Prediction loss: 0.0092 Explore P: 0.8736 RunMean : 22.3934
Episode: 61 Total reward: 14.0 Training loss: 125.3657 Prediction loss: 0.0106 Explore P: 0.8724 RunMean : 22.2581
Episode: 62 Total reward: 12.0 Training loss: 46.8232 Prediction loss: 0.0129 Explore P: 0.8714 RunMean : 22.0952
Episode: 63 Total reward: 31.0 Training loss: 65.0295 Prediction loss: 0.0218 Explore P: 0.8687 RunMean : 22.2344
Episode: 64 Total reward: 26.0 Training loss: 1.2323 Prediction loss: 0.0024 Explore P: 0.8665 RunMean : 22.2923
Episode: 65 Total reward: 13.0 Training loss: 1.0015 Prediction loss: 0.0023 Explore P: 0.8653 RunMean : 22.1515
Episode: 66 Total reward: 21.0 Training loss: 35.4423 Prediction loss: 0.0056 Explore P: 0.8636 RunMean : 22.1343
Episode: 67 Total reward: 20.0 Training loss: 1.3215 Prediction loss: 0.0051 Explore P: 

[2017-05-27 18:53:26,169] Disabling video recorder because <TimeLimit<PredictObsCartpoleEnv<PredictObsCartpole-v0>>> neither supports video mode "rgb_array" nor "ansi".


Episode: 120 Total reward: 19.0 Training loss: 9.2740 Prediction loss: 0.0043 Explore P: 0.7624 RunMean : 22.0000
Episode: 121 Total reward: 18.0 Training loss: 12.6172 Prediction loss: 0.0066 Explore P: 0.7610 RunMean : 22.0700
Episode: 122 Total reward: 48.0 Training loss: 10.6805 Prediction loss: 0.0054 Explore P: 0.7574 RunMean : 22.0500
Episode: 123 Total reward: 15.0 Training loss: 25.4633 Prediction loss: 0.0345 Explore P: 0.7563 RunMean : 22.0800
Episode: 124 Total reward: 11.0 Training loss: 24.9549 Prediction loss: 0.0343 Explore P: 0.7555 RunMean : 21.8900
Episode: 125 Total reward: 21.0 Training loss: 0.5414 Prediction loss: 0.0049 Explore P: 0.7539 RunMean : 21.9400
Episode: 126 Total reward: 13.0 Training loss: 50.0197 Prediction loss: 0.0182 Explore P: 0.7529 RunMean : 21.8800
Episode: 127 Total reward: 39.0 Training loss: 12.3481 Prediction loss: 0.0061 Explore P: 0.7500 RunMean : 21.9600
Episode: 128 Total reward: 16.0 Training loss: 32.7006 Prediction loss: 0.0155 Exp

In [9]:

# Read the scoreboard API key from the environment instead of embedding it in
# the notebook; a missing variable fails loudly with KeyError.
# NOTE(review): the key that was previously committed on this line should be
# treated as leaked and revoked.
gym.upload('/tmp/PredictObsCartpole-experiment-1', api_key=os.environ['OPENAI_GYM_API_KEY'])

[2017-05-24 19:52:17,301] Finished writing results. You can upload them to the scoreboard via gym.upload('D:\\tmp\\cartpole-experiment-1')
[2017-05-24 19:52:17,303] [CartPole-v1] Uploading 392 episodes of training data
[2017-05-24 19:52:18,901] [CartPole-v1] Uploading videos of 8 training episodes (51022 bytes)
[2017-05-24 19:52:19,329] [CartPole-v1] Creating evaluation object from /tmp/cartpole-experiment-1 with learning curve and training video
[2017-05-24 19:52:19,643] 
****************************************************
You successfully uploaded your evaluation on CartPole-v1 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_O2E4DuxjS7GAjEYSb0edqQ

****************************************************
