In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
import numpy as np
from threading import Thread, Lock
from multiprocessing import cpu_count
import copy

import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line disable pop-out window
from ple.games.flappybird import FlappyBird
from ple import PLE
# default use float32 in conda env
# tf.keras.backend.set_floatx('float64')

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


# Config

In [2]:
# set visible GPU
gpu_number = 0

#set seed gpu_number
seed = 2021

gamma = 0.99
update_interval = 5
actor_lr = 0.0005
critic_lr = 0.001
save_model_episode = 100

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[gpu_number], 'GPU')
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
CUR_EPISODE = 0

# Actor

In [4]:
class Actor:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(actor_lr)
        self.entropy_beta = 0.01

    def create_model(self):
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(64, activation='relu'),
            Dense(8, activation='relu'),
            Dense(self.action_dim, activation='softmax')
        ])
    
    def compute_loss(self, actions, logits, advantages):
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        actions = tf.cast(actions, tf.int32)
        policy_loss = ce_loss(actions, logits, sample_weight=tf.stop_gradient(advantages))

        entropy_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        entropy = entropy_loss(logits, logits)

        # ppt page48 solution to pitfall:exploration
        return policy_loss - self.entropy_beta * entropy 

    def TA_state(self, game):
        bucket_range_per_feature = {
            'next_next_pipe_bottom_y': 40,
            'next_next_pipe_dist_to_player': 512,
            'next_next_pipe_top_y': 40,
            'next_pipe_bottom_y': 20,
            'next_pipe_dist_to_player': 20,
            'next_pipe_top_y': 20,
            'player_vel': 4,
            'player_y': 16
        }
        state = copy.deepcopy(game.getGameState())
        
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']

# =============================================================================
#         state_key = [k for k, v in sorted(state.items())]
#         for key in state_key:
#             state[key] = int(state[key] / bucket_range_per_feature[key])
# =============================================================================
            
        relative_state = list(state.values())


        # return the state in tensor type, with batch dimension
        relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
        relative_state = tf.expand_dims(relative_state, axis=0)
        
        return relative_state
    
    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            logits = self.model(states, training=True)
            loss = self.compute_loss(actions, logits, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

# Critic

In [5]:
class Critic:
    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(critic_lr)
    
    def create_model(self):
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(64, activation='relu'),
            Dense(64, activation='relu'),
            Dense(8, activation='relu'),
            Dense(1, activation='linear')
        ])

    def compute_loss(self, v_pred, td_targets):
        # ppt page47 update fV_pi
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, td_targets):
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

# Agent

In [6]:
class Agent:
    def __init__(self):
        game = FlappyBird()
        env = PLE(game, fps=30, display_screen=False, rng=seed)  # game environment interface
        env.reset_game()

        self.state_dim = len(self.TA_state(game)[0])
        self.action_dim = len(env.getActionSet()) # number of actions

        self.global_actor = Actor(self.state_dim, self.action_dim)
        self.global_critic = Critic(self.state_dim)
        self.num_workers = cpu_count() # 16 for R7-5800X

    def TA_state(self, game):
        bucket_range_per_feature = {
            'next_next_pipe_bottom_y': 40,
            'next_next_pipe_dist_to_player': 512,
            'next_next_pipe_top_y': 40,
            'next_pipe_bottom_y': 20,
            'next_pipe_dist_to_player': 20,
            'next_pipe_top_y': 20,
            'player_vel': 4,
            'player_y': 16
        }
        state = copy.deepcopy(game.getGameState())
        
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']

# =============================================================================
#         state_key = [k for k, v in sorted(state.items())]
#         for key in state_key:
#             state[key] = int(state[key] / bucket_range_per_feature[key])
# =============================================================================
            
        relative_state = list(state.values())


        # return the state in tensor type, with batch dimension
        relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
        relative_state = tf.expand_dims(relative_state, axis=0)
        
        return relative_state

    def train(self, max_episodes=20000):
        workers = []

        for _ in range(self.num_workers):
            game = FlappyBird()
            env = PLE(game, fps=30, display_screen=False, rng=seed)  # game environment interface
            env.reset_game()

            workers.append(WorkerAgent(game, env, self.global_actor, self.global_critic, max_episodes))
            
        for worker in workers:
            worker.start()
        
        for worker in workers:
            worker.join()

# Worker Thread

In [7]:
class WorkerAgent(Thread):
    def __init__(self, game, env, global_actor, global_critic, max_episodes):
        Thread.__init__(self)
        self.lock = Lock()
        self.game = game
        self.env = env
        self.state_dim = len(self.TA_state(self.game)[0])
        self.action_dim = len(self.env.getActionSet())

        self.max_episodes = max_episodes
        self.global_actor = global_actor
        self.global_critic = global_critic
        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim)

        self.actor.model.set_weights(self.global_actor.model.get_weights())
        self.critic.model.set_weights(self.global_critic.model.get_weights())

    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value # estimate of fVpi(t+1)

        for k in reversed(range(0, len(rewards))):
            cumulative = gamma * cumulative + rewards[k] # ppt page 47 紅字, estimate fQpi
            td_targets[k] = cumulative
        return td_targets

    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch

    def TA_state(self, game):
        bucket_range_per_feature = {
            'next_next_pipe_bottom_y': 40,
            'next_next_pipe_dist_to_player': 512,
            'next_next_pipe_top_y': 40,
            'next_pipe_bottom_y': 20,
            'next_pipe_dist_to_player': 20,
            'next_pipe_top_y': 20,
            'player_vel': 4,
            'player_y': 16
        }
        state = copy.deepcopy(game.getGameState())
        
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']


        state_key = [k for k, v in sorted(state.items())]
        for key in state_key:
            state[key] = int(state[key] / bucket_range_per_feature[key])

        relative_state = list(state.values())


        # return the state in tensor type, with batch dimension
        relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
        relative_state = tf.expand_dims(relative_state, axis=0)
        
        return relative_state

    def train(self):
        global CUR_EPISODE

        while CUR_EPISODE < self.max_episodes:
            state_batch = []
            action_batch = []
            reward_batch = []
            episode_reward, done = 0, False
            
            # Reset the environment
            self.env.reset_game()
            state = self.TA_state(self.game)

            
            while not done:
                probs = self.actor.model.predict(
                    np.reshape(state, [1, self.state_dim]))
                action = np.random.choice(self.action_dim,p=probs[0])
                reward = self.env.act(self.env.getActionSet()[action])
                done = self.env.game_over()

                next_state = self.TA_state(self.game)  # get next state
                state = np.reshape(state, [1, self.state_dim])
                action = np.reshape(action, [1, 1])
                next_state = np.reshape(next_state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])
                
                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)

                if(len(state_batch) >= update_interval or done):
                    states = self.list_to_batch(state_batch)
                    actions = self.list_to_batch(action_batch)
                    rewards = self.list_to_batch(reward_batch)

                    next_v_value = self.critic.model.predict(next_state) # fVpi(t+1)
                    td_targets = self.n_step_td_target(rewards, next_v_value, done)
                    advantages = td_targets - self.critic.model.predict(states)

                    with self.lock:
                        actor_loss = self.global_actor.train(
                            states, actions, advantages)
                        critic_loss = self.global_critic.train(
                            states, td_targets)

                        self.actor.model.set_weights(
                            self.global_actor.model.get_weights())
                        self.critic.model.set_weights(
                            self.global_critic.model.get_weights())

                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    # td_target_batch = []
                    # advatnage_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]
            
            if CUR_EPISODE % save_model_episode == 0:
                self.global_actor.model.save("models/ep%d"%CUR_EPISODE)

            print('EP{} EpisodeReward={}'.format(CUR_EPISODE, episode_reward))
            CUR_EPISODE += 1

    def run(self):
        self.train()

# Start Training

In [None]:
agent = Agent()
agent.train()





INFO:tensorflow:Assets written to: models/ep0\assets
EP0 EpisodeReward=-5.0


Exception in thread Thread-16:
Traceback (most recent call last):
  File "c:\users\user\anaconda3\envs\tf2\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "<ipython-input-7-c821059f13f0>", line 135, in run
    self.train()
  File "<ipython-input-7-c821059f13f0>", line 110, in train
    states, actions, advantages)
  File "<ipython-input-4-d3d9c2d9ef9b>", line 63, in train
    logits = self.model(states, training=True)
  File "c:\users\user\anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1012, in __call__
    outputs = call_fn(inputs, *args, **kwargs)
  File "c:\users\user\anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\sequential.py", line 375, in call
    return super(Sequential, self).call(inputs, training=training, mask=mask)
  File "c:\users\user\anaconda3\envs\tf2\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 425, in call
    inputs, training=training, mask=mask)
  File "c:\






EP1 EpisodeReward=-5.0
EP2 EpisodeReward=-5.0
EP3 EpisodeReward=-5.0
EP4 EpisodeReward=-5.0
EP5 EpisodeReward=-5.0
INFO:tensorflow:Assets written to: models/ep0\assets
EP6 EpisodeReward=-5.0




INFO:tensorflow:Assets written to: models/ep0\assets


INFO:tensorflow:Assets written to: models/ep0\assets


EP7 EpisodeReward=-5.0
INFO:tensorflow:Assets written to: models/ep0\assets


INFO:tensorflow:Assets written to: models/ep0\assets


EP8 EpisodeReward=-5.0
EP9 EpisodeReward=-5.0
EP10 EpisodeReward=-5.0
EP11 EpisodeReward=-5.0
EP12 EpisodeReward=-5.0
EP13 EpisodeReward=-5.0
EP14 EpisodeReward=-5.0
EP15 EpisodeReward=-5.0
EP16 EpisodeReward=-5.0
EP17 EpisodeReward=-5.0
EP18 EpisodeReward=-5.0
EP19 EpisodeReward=-5.0
EP20 EpisodeReward=-5.0
EP21 EpisodeReward=-5.0
EP22 EpisodeReward=-5.0
EP23 EpisodeReward=-5.0
EP24 EpisodeReward=-5.0
EP25 EpisodeReward=-5.0
EP26 EpisodeReward=-5.0
EP27 EpisodeReward=-5.0
EP28 EpisodeReward=-5.0
EP29 EpisodeReward=-5.0
EP30 EpisodeReward=-5.0
EP31 EpisodeReward=-5.0
EP32 EpisodeReward=-5.0
EP33 EpisodeReward=-5.0
EP34 EpisodeReward=-5.0
EP35 EpisodeReward=-5.0
EP36 EpisodeReward=-5.0
EP37 EpisodeReward=-5.0
EP38 EpisodeReward=-5.0
EP39 EpisodeReward=-5.0
EP40 EpisodeReward=-5.0
EP41 EpisodeReward=-5.0
EP42 EpisodeReward=-5.0
EP43 EpisodeReward=-5.0
EP44 EpisodeReward=-5.0
EP45 EpisodeReward=-5.0
EP46 EpisodeReward=-5.0
EP47 EpisodeReward=-5.0
EP48 EpisodeReward=-5.0
EP49 EpisodeReward

# Convert the model to fit test environment
During the training process, we feed in states with bucketing. However in our test environment, we do not feed states with bucketing. Hence, we have to manually adjust the input state to fit out models.

In [None]:
class Bucket(keras.layers.Layer):
    def __init__(self):
        super(Bucket, self).__init__()
        self.bucket_range_per_feature = {
            'next_next_pipe_bottom_y': 40,
            'next_next_pipe_dist_to_player': 512,
            'next_next_pipe_top_y': 40,
            'next_pipe_bottom_y': 20,
            'next_pipe_dist_to_player': 20,
            'next_pipe_top_y': 20,
            'player_vel': 4,
            'player_y': 16
        }
        self.bucket_list = tf.constant([16,4,20,20,20,512,40,40])

    def call(self, inputs):
        

        inputs = tf.cast(tf.cast(tf.math.divide(inputs, tf.cast(self.bucket_list, tf.float32)),tf.int32),tf.float32)
        # print(inputs.values())

        return inputs

In [None]:
class Actor_save(tf.keras.Model):
    def __init__(self, state_dim, action_dim):
        super(Actor_save, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.temp_model = self.create_temp_model()
        inf_model = tf.keras.models.load_model("C:\\Users\\User\\Desktop\\comp4\\hand_bucket_model\\ep1100", compile=False) #change the path to your model path
        self.mid_model = self.create_model()S
        self.mid_model.set_weights(inf_model.get_weights())
        self.model = self.final_model()
        self.opt = tf.keras.optimizers.Adam(actor_lr)
        self.entropy_beta = 0.01


    def create_temp_model(self):
        input = Input(shape=(self.state_dim,))
        output = Bucket()(input)
        model = Model(inputs=input,outputs=output)
        return model
        # return tf.keras.Sequential([
        #     Input((self.state_dim,)),
        #     Bucket()
        # ])
    def create_model(self):
        return tf.keras.Sequential([
            Input(shape=(self.state_dim,)),
            Dense(64, activation='relu'),
            Dense(8, activation='relu'),
            Dense(self.action_dim, activation='softmax')
        ])
    
    def final_model(self):
        input = Input(shape=(self.state_dim))
        x = self.temp_model(input)
        output = self.mid_model(x)
        model = Model(input,output)
        # return tf.keras.Sequential([
        #         Input((self.state_dim,)),
        #         self.temp_model(),
        #         self.mid_model()
        # ])
        return model

    def compute_loss(self, actions, logits, advantages):
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        actions = tf.cast(actions, tf.int32)
        policy_loss = ce_loss(actions, logits, sample_weight=tf.stop_gradient(advantages))

        entropy_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        entropy = entropy_loss(logits, logits)

        # ppt page48 solution to pitfall:exploration
        return policy_loss - self.entropy_beta * entropy 
    def call(self, states):
        x = self.model(states)
        return x
    
    def TA_state(self, game):
        state = copy.deepcopy(game.getGameState())
        
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']
            
        relative_state = list(state.values())


        # return the state in tensor type, with batch dimension
        relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
        relative_state = tf.expand_dims(relative_state, axis=0)
        
        return relative_state
    
    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            logits = self.model(states, training=True)
            loss = self.compute_loss(actions, logits, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

In [None]:
actor_save = Actor_save(8,2)
input_array = tf.random.uniform((1,8))
input = np.reshape(input_array, [1, 8])
out = actor_save(input_array)
actor_save.save("test/ep1100")

# Report

At first, I was considering A2C. In order to fasten the speed of training, I chose to implement A3C using multithreading. I took an online github repository for reference(it was trained for Cartpole).<br>
Reference: https://github.com/marload/DeepRL-TensorFlow2 <br><br>
Since the A3C model works very well, there is no need to try other models.<br><br>
My A3C model has several hyperparameters, including gamma, update interval(determine how often a worker thread update its parameter to global agent), actor learning rate, critic learning rate, entropy beta. After tuing, I set gamma = 0.99, update_interval = 5, actor_lr = 0.0005, critic_lr = 0.001.<br>
As for the optimizer, Adam is what I chose.<br><br>
In the beginning, I did not put the values of state to the bucket. It turned out that it was not learning. Then I tried to throw the values in the bucket and it made progresses. I think it is beacuse after bucketing, the range of the input state decreases, and it makes the model to learn faster.<br><br>
When I finally have to save the model, I discovered that the test environment passed the value without using bucket. Therefore I have to convert my original model to adjust to the test environment requirement. Before pass the state to the network, I first pass the state to a layer Bucket() to do bucketing. It took me several hours to solve this problem.<br><br>
In this competition, I make good use of Python's threading to implement A3C. It is more difficult to debug, harder to track the weigths. It was a great experience for me.