In [1]:
import tensorflow as tf
from keras.layers import Input,Conv2D,BatchNormalization,Activation,Flatten,Dense
from keras.models import Model
from keras.optimizers import RMSprop
from keras import backend as K

import cv2 as cv
import gym
import gym_chrome_dino
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
import threading

%run Image_Processing

Using TensorFlow backend.


In [2]:
class A3C:
    
    def __init__(self,input_shape,output_shape,gamma,threads,environment):
        self.input_shape_ = input_shape
        self.output_shape_ = output_shape
        self.gamma_ = gamma
        self.threads_ = threads
        self.environment_ = environment
        
        self.actor_,self.critic_ = self.__build_actor_critic__()
        
        self.sess_ = tf.InteractiveSession()
        K.set_session(self.sess_)
        self.sess_.run(tf.global_variables_initializer())
        
        
        
    def __build_actor_critic__(self):
        input_ = Input(shape=self.input_shape_)
        conv = Conv2D(filters=32,kernel_size=(8,8),padding="valid",strides=(2,2))(input_)
        conv = BatchNormalization()(conv)
        conv = Activation('relu')(conv)
        conv = Conv2D(filters=32,kernel_size=(8,8),padding="valid",strides=(2,2))(conv)
        conv = BatchNormalization()(conv)
        conv = Activation('relu')(conv)
        conv = Conv2D(filters=32,kernel_size=(8,8),padding="valid",strides=(2,2))(conv)
        conv = BatchNormalization()(conv)
        conv = Activation('relu')(conv)
        
        flat = Flatten()(conv)
        
        dense = Dense(units=512)(flat)
        dense = Activation('relu')(dense)
        dense = Dense(units=128)(dense)
        dense = Activation('relu')(dense)
        
        policy = Dense(units=self.output_shape_, activation='softmax')(dense)
        value = Dense(units=1,activation='linear')(dense)
        
        actor = Model(inputs=input_,outputs=policy)
        critic = Model(inputs=input_,outputs=value)
        
        actor._make_predict_function()
        critic._make_predict_function()
        
        return actor,critic
    
    
    
    def __update_actor__(self):
        action = K.placeholder(shape=(None, self.output_shape_))
        advantages = K.placeholder(shape=(None, ))
        policy = self.actor_.output
        
        good_prob = K.sum(action * policy, axis=1)
        eligibility = K.log(good_prob+1e-10) * K.stop_gradient(advantages)
        loss = -K.sum(eligibility)
        
        gradient = RMSprop()
        updates = gradient.get_updates(self.actor_.trainable_weights,[],loss)
        train = K.function([self.actor_.input,action,advantages],[self.actor_.output],updates=updates)
        
        return train
    
    
    
    def __update_critic__(self):
        discounted_rewards = K.placeholder(shape=(None, ))
        value = self.critic_.output
        
        loss = K.mean(K.square(discounted_rewards - value))
        
        gradient = RMSprop()
        updates = gradient.get_updates(self.critic_.trainable_weights, [], loss)
        train = K.function([self.critic_.input, discounted_rewards],[self.critic_.output],updates=updates)
        
        return train
    
    
    
    def train(self, max_episode, total_reward):
        dinos = [Dino(self.input_shape_, self.output_shape_, self.environment_, self.actor_, self.critic_,
                     self.__update_actor__(), self.__update_critic__(), self.gamma_, self.sess_,
                      max_episode, total_reward) for i in range(self.threads_)]
        for dino in dinos:
            dino.start()

    
    
    
    
    
class Dino(threading.Thread):
    
    def __init__(self,input_shape,output_shape,environment,global_actor,global_critic,
                update_actor,update_critic,gamma,sess,max_episode,total_rewards):
        
        threading.Thread.__init__(self)
        
        self.input_shape_ = input_shape
        self.output_shape_ = output_shape
        self.environment_ = environment
        self.global_actor_ = global_actor
        self.global_critic_ = global_critic
        self.update_actor_ = update_actor
        self.update_critic_ = update_critic
        self.gamma_ = gamma
        self.sess_ = sess
        self.max_episode_ = max_episode
        self.total_rewards_ = total_rewards
        
        self.states_, self.actions_, self.rewards_ = deque(),deque(),deque()
        
        
        
    def run(self):
        e = 0
        while e != self.max_episode_:
            done = False
            total_reward = 0
            state = self.environment_.reset()
            state = self.__process_initial_state__(state)
            while not done:
                action = self.__action__(state)
                next_state,reward,done,info = self.environment_.step(action)
                total_reward += reward
                state = self.__process_new_state__(state,next_state)
                if done:
                    e += 1
                    print("episode: {}/{}\n reward: {}".format(e+1,self.max_episode,total_reward))
                    self.total_rewards.append(total_reward)
                    self.__update_actor_critic__(total_reward > 600)
        self.__plot_total_rewards__()
        self.__save_weights__("{}".format(self.max_episodes_))
        
    
    
    def __process_initial_state__(self, state):
        canny = img_processing("canny",self.input_shape_[0],self.input_shape_[0],state)
        stacked = stack_images([canny,canny,canny,canny],4)
        reshaped_state = reshape_to(4,stacked)
        return reshaped_state
    
    
    
    def __process_new_state__(self, stacked_state, next_state):
        canny = img_processing("canny",self.input_shape_[0],self.input_shape_[0],next_state)
        new_stacked = fifo_images(next_state,stacked_state)
        return new_stacked
    
    
    
    def __action__(self,state):
        policy = self.global_actor_.predict(state)
        a = np.random.choice(self.output_shape_,1,p=policy)
        return a
    
    
    
    def __discounted_rewards__(self, rewards, done):
        discounted_rewards = np.zeros_like(rewards)
        R = self.global_critic_.predict(self.states_[-1]) * (1-int(done))
        
        for i in reversed(range(len(rewards))):
            R = rewards[i] + self.gamma_ * R
            discounted_rewards[i] = R
        
        return discounted_rewards
    
    
    
    def __update_actor_critic__(self, done):
        states = self.states_
        actions = self.actions_
        value = self.global_critic_.predict(states)
        discounted_rewards = self.__discounted_rewards__(self.rewards_, done)
        advantages = discounted_rewards - value
        
        self.update_actor_(states,actions,advantages)
        self.update_critic_(states,discounted_rewards)
        self.__clear_deque__()
        
        
        
    def __clear_deque__(self):
        self.states_.clear()
        self.actions_.clear()
        self.rewards_.clear()
        
        
        
    def __remember__(self,state,action,reward):
        self.states_.append(state)
        act = np.zeros(self.output_shape_)
        act[action] = 1
        self.actions_.append(act)
        self.rewards_.append(reward)
        
        
        
    def __save_weights__(self,name):
        self.global_actor_.save_weights(name+"_actor.h5")
        self.global_critic_.save_weights(name+"_critic.h5")
        
        
        
    def __plot_total_rewards__(self):
        plt.figure(figsize=(10,8))
        plt.title("Reward / Episode")
        plt.xlabel("epochs")
        plt.ylabel("rewards")
        plt.plot(self.total_rewards_, label="rewards")
        plt.legend()
        plt.savefig("plot.png")


In [3]:
environment = gym.make("ChromeDinoNoBrowser-v0")

In [4]:
input_shape = (80,80,4)
output_shape = environment.action_space.n
gamma = 1
threads = 1

max_episode = 100
total_rewards = []

In [5]:
a3c = A3C(input_shape, output_shape, gamma, threads, environment)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



In [6]:
a3c.train(max_episode, total_rewards)

Exception in thread Thread-6:
Traceback (most recent call last):
  File "C:\Users\CSY\Anaconda3\envs\env1\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-ef37eab7c6f6>", line 124, in run
    action = self.__action__(state)
  File "<ipython-input-2-ef37eab7c6f6>", line 154, in __action__
    policy = self.global_actor_.predict(state)
  File "C:\Users\CSY\Anaconda3\envs\env1\lib\site-packages\keras\engine\training.py", line 1462, in predict
    callbacks=callbacks)
  File "C:\Users\CSY\Anaconda3\envs\env1\lib\site-packages\keras\engine\training_arrays.py", line 324, in predict_loop
    batch_outs = f(ins_batch)
  File "C:\Users\CSY\Anaconda3\envs\env1\lib\site-packages\tensorflow_core\python\keras\backend.py", line 3473, in __call__
    self._make_callable(feed_arrays, feed_symbols, symbol_vals, session)
  File "C:\Users\CSY\Anaconda3\envs\env1\lib\site-packages\tensorflow_core\python\keras\backend.py", line 3410, in _make_callable
    callable_fn

In [1]:
import numpy as np

In [22]:
a = np.arange(5)
b = np.array([[2,3,4,5,6]]).T

In [28]:
a.shape

(5,)

In [29]:
b.shape

(5, 1)

In [31]:
a

array([0, 1, 2, 3, 4])

In [32]:
b

array([[2],
       [3],
       [4],
       [5],
       [6]])

In [30]:
b-a

array([[ 2,  1,  0, -1, -2],
       [ 3,  2,  1,  0, -1],
       [ 4,  3,  2,  1,  0],
       [ 5,  4,  3,  2,  1],
       [ 6,  5,  4,  3,  2]])