In [1]:
import tensorflow as tf
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the fourth GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [2]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE

seed = 2021
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False, rng=seed)  # environment interface to game
env.reset_game()
game.getGameState()  

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


{'player_y': 256,
 'player_vel': 0,
 'next_pipe_dist_to_player': 309.0,
 'next_pipe_top_y': 127,
 'next_pipe_bottom_y': 227,
 'next_next_pipe_dist_to_player': 453.0,
 'next_next_pipe_top_y': 95,
 'next_next_pipe_bottom_y': 195}

In [3]:
import math
import copy
def TA_state():
    state = copy.deepcopy(game.getGameState())
    
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']
    relative_state = list(state.values())


    # return the state in tensor type, with batch dimension
    relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
    relative_state = tf.expand_dims(relative_state, axis=0)
    
    return relative_state

print(TA_state())
state_numpy = TA_state().numpy()
print(state_numpy)
print((state_numpy[0][3]+state_numpy[0][4])/2)

tf.Tensor([[ 256.    0.  309. -129.  -29.  453. -161.  -61.]], shape=(1, 8), dtype=float32)
[[ 256.    0.  309. -129.  -29.  453. -161.  -61.]]
-79.0


In [4]:
import moviepy.editor as mpy
def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

We use "Actor-Crtic" to train out model. Here are models information.

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class PG_model(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.d3 = tf.keras.layers.Dense(128,activation='relu')
        self.d4 = tf.keras.layers.Dense(64,activation='relu')
        self.d5 = tf.keras.layers.Dense(32,activation='relu')
        self.d6 = tf.keras.layers.Dense(16,activation='relu')
        self.out = tf.keras.layers.Dense(2,activation='softmax')
        self._set_inputs(tf.keras.Input(shape=[8], dtype=tf.float32))

    def call(self, input_data):
        x = input_data
        x = self.d3(x)
        x = self.d4(x)
        x = self.d5(x)
        x = self.d6(x)
        x = self.out(x)
        return x

class critic_model(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.d0 = tf.keras.layers.Dense(128,activation='relu')
        self.d1 = tf.keras.layers.Dense(64,activation='relu')
        self.d2 = tf.keras.layers.Dense(32,activation='relu')
        self.d3 = tf.keras.layers.Dense(16,activation='relu')
        self.d4 = tf.keras.layers.Dense(8,activation='relu')
        self.a = tf.keras.layers.Dense(1,activation = None)
        self._set_inputs(tf.keras.Input(shape=[8], dtype=tf.float32))

    def call(self, input_data):
        x = input_data
        x = self.d0(x)
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        x = self.d4(x)
        a = self.a(x)
        return a

Loading model

In [51]:
online_agent = PG_model()
critic_agent = critic_model()
# online_agent = tf.keras.models.load_model("./saved_model/policy_gradient_200_0")
# critic_agent = tf.keras.models.load_model("./saved_model/critic_agent_200_0")

state = TA_state()
action_prob = online_agent(state)
print(action_prob[0])

tf.Tensor([0.21632096 0.783679  ], shape=(2,), dtype=float32)


Traing process.

In [54]:
# lr = 1e-5
# lr = 1e-7
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
optimizer_2 = tf.keras.optimizers.Adam(learning_rate=5e-5)
average_loss = tf.keras.metrics.Mean(name='loss')
discount_factor=0.99

def a_c_loss(prob, action, reward): 
    log_prob = tf.math.log(prob[0][action])
#     print(prob)
    loss = -log_prob * reward - 1
    return loss
        
def train_step_actor_critic(state, action, reward, next_state):
    with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
        p = online_agent(state, training=True)
        v = critic_agent(state,training=True)
        vn = critic_agent(next_state, training=True)
        td = reward + discount_factor*vn - v
        a_loss = a_c_loss(p, action, td)
        c_loss = td**2
    grads1 = tape1.gradient(a_loss, online_agent.trainable_variables)
    grads2 = tape2.gradient(c_loss, critic_agent.trainable_variables)
    optimizer.apply_gradients(zip(grads1, online_agent.trainable_variables))
    optimizer_2.apply_gradients(zip(grads2, critic_agent.trainable_variables))
    average_loss.update_state(a_loss)


 We add other terms to modify reward. We want the bird to keep its position between next pipe's bottom and top.

In [None]:
import random
max_episode = 10001
print_every_episode = 50
show_gif_every_episode = 5000
saved_model_episode = 200
steps = []

for episode in range(max_episode):
  
    env.reset_game()
    total_reward = 0
    frames = [env.getScreenRGB()]
    step = 0
    a_p = 0
    
    while not env.game_over():
        state = TA_state()
        action_prob = online_agent(state)
        action = 0
        if np.random.random_sample() > action_prob[0][0]: action = 1;
        reward = env.act(env.getActionSet()[action])
#         if not env.game_over():
        state_numpy = state.numpy()
        # method 1
#         if (state_numpy[0][3]+state_numpy[0][4])/2 < -50 and action == 1:  reward -= 3
#         elif (state_numpy[0][3]+state_numpy[0][4])/2 > 50 and action == 0:  reward -= 3
#         if ((state_numpy[0][3]+state_numpy[0][4])/2 > 35 or (state_numpy[0][3]+state_numpy[0][4])/2 < -35):  reward -= 1
        # method 2
        if (state_numpy[0][3]+state_numpy[0][4])/2 < -15 and action == 1:  
            reward -= abs((state_numpy[0][3]+state_numpy[0][4])/2)/20
        elif (state_numpy[0][3]+state_numpy[0][4])/2 > 15 and action == 0:
            reward -= abs((state_numpy[0][3]+state_numpy[0][4])/2)/20

        
        frames.append(env.getScreenRGB())  

        total_reward += reward
        step += 1
        a_p = action_prob
        
        train_step_actor_critic(state, action, reward, TA_state())
        
#         if(step > 500): break

    steps.append(step)
    
    if episode % saved_model_episode == 0 and episode != 0:
        path = "./saved_model/policy_gradient_" + str(episode)
        online_agent.save(path)
        path = "./saved_model/critic_agent_" + str(episode)
        critic_agent.save(path)
    if episode % print_every_episode == 0:
        print("[{}] time live:{}, cumulated reward: {}, loss: {:.5f}, action prob: {}"\
              .format(episode, step, total_reward, average_loss.result(), a_p))
        print(steps)
        steps = []
        average_loss.reset_states()
    if episode % show_gif_every_episode == 0:
        print("len frames:", len(frames))
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))




INFO:tensorflow:Assets written to: ./saved_model/policy_gradient_0/assets


INFO:tensorflow:Assets written to: ./saved_model/critic_agent_0/assets
[0] time live:62, cumulated reward: -34.3, loss: -1.61027, action prob: [[0.53031075 0.46968928]]
[62]
len frames: 63
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                             

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[50] time live:187, cumulated reward: -4.85, loss: -1.09868, action prob: [[7.333202e-04 9.992667e-01]]
[62, 62, 73, 100, 67, 64, 62, 62, 69, 98, 67, 98, 64, 62, 134, 62, 65, 70, 109, 109, 70, 148, 62, 105, 62, 69, 54, 62, 99, 98, 62, 67, 136, 147, 98, 74, 62, 98, 134, 74, 74, 69, 62, 68, 113, 182, 180, 105, 62, 187]
[100] time live:223, cumulated reward: -0.8499999999999996, loss: -1.01617, action prob: [[0.99264246 0.00735754]]
[139, 98, 70, 62, 297, 108, 141, 187, 134, 98, 144, 69, 525, 288, 98, 212, 66, 370, 62, 401, 98, 211, 360, 299, 550, 140, 823, 62, 68, 98, 134, 252, 247, 175, 247, 68, 1130, 62, 224, 181, 219, 256, 595, 671, 304, 99, 413, 105, 186, 223]
[150] time live:66, cumulated reward: -4.0, loss: -1.00314, action prob: [[9.6564395e-07 9.9999905e-01]]
[219, 296, 70, 145, 175, 489, 134, 360, 98, 72, 62, 215, 175, 67, 98, 360, 327, 62, 114, 142, 211, 62, 62, 362, 1341, 143, 182, 98, 211, 219, 143, 822, 62, 141, 100, 98, 98, 179, 101, 477, 62, 62, 146, 211, 586, 324, 437, 62

We first tried "REINFORCE" and "REINFORCE with baseline", but they didn't work well. After that, we used "Actor Critic" and it improved a little bit. However, the result was still not satiafying enough. The most beneficial way we used was "reward shaping", it helped regulate the birds position a lot and thus it could fly pass 5 pipes.