In [1]:
import tensorflow as tf
import numpy as np

In [2]:
import os
# Move into the PLE checkout so `ple` can be imported. The path is relative, so
# this cell only works once per kernel session started from the parent folder.
os.chdir('PyGame-Learning-Environment')
print(os.getcwd())
# Use SDL's "dummy" video driver so that no pop-up game window appears.
# NOTE(review): presumably this has to be set before pygame is initialised by the imports below — confirm.
os.environ["SDL_VIDEODRIVER"] = "dummy"
from ple.games.flappybird import FlappyBird
from ple import PLE

# Create the game and wrap it in the PLE environment interface (no on-screen rendering).
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

C:\Users\stat_pc\Google 雲端硬碟\deeper_learning\Lab\PyGame-Learning-Environment
pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [3]:
# Define Input Size
# NOTE(review): IMG_WIDTH, IMG_HEIGHT and NUM_STACK are not referenced by any
# cell visible in this notebook (the agent consumes a 32-value state vector
# instead of stacked screens) — confirm whether they are still needed.
IMG_WIDTH = 84
IMG_HEIGHT = 84
NUM_STACK = 4
# For Epsilon-greedy: lower bound applied to the exploring rate in Agent.update_parameters
MIN_EXPLORING_RATE = 0.01

In [19]:
class Agent:
    """Q-network wrapper: builds the model, computes the TD loss and picks actions.

    Attributes:
        exploring_rate: probability of taking a random action (epsilon).
        discount_factor: discount (gamma) applied to the bootstrapped target.
        num_action: number of discrete actions, i.e. size of the Q output.
        model: Keras model mapping a 32-value state vector to one Q-value per action.
    """

    def __init__(self, name, num_action, discount_factor=0.99):
        """Create the agent and build its Keras model called `name`."""
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.model = self.build_model(name)

    def build_model(self, name):
        """Build the Q-network.

        Input:  state vector, shape (batch_size, 32).
        Output: one Q-value per action, shape (batch_size, num_action).
        """
        screen_stack = tf.keras.Input(shape=[32], dtype=tf.float32)

        x = tf.keras.layers.Dense(units=64)(screen_stack)
        x = tf.keras.layers.ReLU()(x)
        # The input is already rank 2, so Flatten leaves the shape unchanged;
        # it is kept so the layer structure of the model stays as it was.
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(units=128)(x)
        x = tf.keras.layers.ReLU()(x)
        Q = tf.keras.layers.Dense(self.num_action)(x)

        model = tf.keras.Model(name=name, inputs=screen_stack, outputs=Q)

        return model

    def loss(self, state, action, reward, tar_Q, ternimal):
        """Mean squared TD error of the model on a batch of transitions.

        Args:
            state: batch of states, shape (batch_size, 32).
            action: integer action index taken in each state, shape (batch_size,).
            reward: reward received for each transition, shape (batch_size,).
            tar_Q: max_a' Q(s', a') for each next state, shape (batch_size,).
            ternimal: boolean flags (parameter name kept as-is for callers),
                True where the transition ended the episode, shape (batch_size,).

        Returns:
            Scalar: mean((r + gamma * tar_Q * (1 - terminal) - Q(s, a))^2).
        """
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)
        index = tf.stack([tf.range(tf.shape(action)[0]), action], axis=1)
        # Q(s,a,theta) for the selected a, shape (batch_size,)
        Q = tf.gather_nd(output, index)

        # Zero the bootstrapped value for terminal transitions. The mask must be
        # built from the argument with TF ops: reading a notebook global through
        # np.array() would be frozen into the graph when tf.function traces.
        not_terminal = 1.0 - tf.cast(ternimal, tar_Q.dtype)
        target = reward + self.discount_factor * tar_Q * not_terminal

        loss = tf.reduce_mean(tf.square(target - Q))

        return loss

    def max_Q(self, state):
        """Return max_a Q(state, a) for every state in the batch, shape (batch_size,)."""
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)

        return tf.reduce_max(output, axis=1)

    def select_action(self, state):
        """Pick an action index epsilon-greedily for a single-state batch.

        Args:
            state: tensor of shape (1, 32); only the first row is used.

        Returns:
            A Python int in [0, num_action). Both branches return the same type
            so the index can be stored in the replay buffer and converted to a
            tensor uniformly.
        """
        # epsilon-greedy
        if np.random.rand() < self.exploring_rate:
            action = int(np.random.choice(self.num_action))  # Select a random action
        else:
            output = self.model(state)

            # select action with highest action-value
            action = int(tf.argmax(output, axis=1)[0])

        return action

    def update_parameters(self, episode):
        """Set the exploring rate for `episode`.

        The rate is 0.99 ** (episode / 30), capped above at 0.5 and below at
        the module-level MIN_EXPLORING_RATE.
        """
        self.exploring_rate = max(MIN_EXPLORING_RATE, min(0.5, 0.99**((episode) / 30)))

    def shutdown_explore(self):
        """Make action selection fully greedy by setting the exploring rate to 0."""
        self.exploring_rate = 0

In [20]:
# init agent: one Q output per action exposed by the environment
num_action = len(env.getActionSet())

# agent for frequently updating (trained on every training step)
online_agent = Agent('online', num_action)

# agent for slow updating (provides the bootstrapped target values)
target_agent = Agent('target', num_action)
# synchronize target model's weight with online model's weight
target_agent.model.set_weights(online_agent.model.get_weights())

In [21]:
# Adam optimizer that train_step uses to update the online network
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# running mean of the training loss; updated in train_step, read and reset by the training loop
average_loss = tf.keras.metrics.Mean(name='loss')

@tf.function
def train_step(state, action, reward, next_state, ternimal):
    """Apply one gradient update to the online network on a batch of transitions.

    The bootstrapped value comes from the delayed target network; the resulting
    loss is also accumulated into the `average_loss` metric. Nothing is returned.
    `ternimal` (sic) holds the per-transition end-of-episode flags.
    """
    # Delayed Target Network: evaluated outside the tape, so it is not differentiated
    bootstrap_Q = target_agent.max_Q(next_state)
    online_vars = online_agent.model.trainable_variables

    with tf.GradientTape() as tape:
        td_loss = online_agent.loss(state, action, reward, bootstrap_Q, ternimal)

    grads = tape.gradient(td_loss, online_vars)
    optimizer.apply_gradients(zip(grads, online_vars))

    average_loss.update_state(td_loss)

In [22]:
class Replay_buffer():
    """Fixed-capacity store of experiences with uniform random sampling.

    Each experience is a 5-tuple (state, action, reward, next_state, terminal).
    Once `buffer_size` experiences are stored, every new one overwrites the
    oldest in place (ring buffer), so insertion stays O(1) instead of shifting
    the whole list. As a consequence `experiences` is not kept in
    chronological order after the buffer has wrapped around.
    """

    def __init__(self, buffer_size=50000):
        self.experiences = []
        self.buffer_size = buffer_size
        # Index of the oldest stored experience, i.e. the next slot to overwrite
        # once the buffer is full.
        self._oldest = 0

    def add(self, experience):
        """Store `experience`, evicting the oldest one when the buffer is full."""
        if len(self.experiences) < self.buffer_size:
            self.experiences.append(experience)
            return
        self.experiences[self._oldest] = experience
        self._oldest = (self._oldest + 1) % self.buffer_size

    def sample(self, size):
        """
        sample experience from buffer

        Draws `size` experiences uniformly at random. Sampling is without
        replacement when enough experiences are stored, otherwise with
        replacement.

        Returns:
            Five lists of length `size`:
            (states, actions, rewards, states_prime, terminal).

        Raises:
            ValueError: if `size` > 0 and the buffer is empty.
        """
        stored = len(self.experiences)
        if stored == 0 and size > 0:
            raise ValueError("cannot sample from an empty replay buffer")

        picks = np.random.choice(stored, size=size, replace=size > stored)
        batch = [self.experiences[i] for i in picks]

        # split the sampled (s, a, r, s', terminal) tuples into per-field lists
        states = [exp[0] for exp in batch]
        actions = [exp[1] for exp in batch]
        rewards = [exp[2] for exp in batch]
        states_prime = [exp[3] for exp in batch]
        terminal = [exp[4] for exp in batch]

        return states, actions, rewards, states_prime, terminal

In [23]:
# init buffer: replay memory with the default capacity (buffer_size=50000)
buffer = Replay_buffer()

In [24]:
import moviepy.editor as mpy

def make_anim(images, fps=60, true_image=False):
    """Build a moviepy VideoClip that plays `images` at `fps` frames per second.

    Args:
        images: sequence of frames (numpy arrays); must not be empty.
        fps: playback rate; the clip lasts len(images) / fps seconds.
        true_image: if True, frames are cast to uint8 as they are; otherwise
            they are rescaled with (x + 1) / 2 * 255 first (i.e. the values are
            assumed to lie in [-1, 1]).

    Returns:
        The clip, with its `fps` attribute set.
    """
    duration = len(images) / fps
    last_index = len(images) - 1

    def make_frame(t):
        # Frame k covers [k / fps, (k + 1) / fps). t == duration (or a rounding
        # overshoot) would index one past the end, so clamp to the final frame
        # explicitly rather than hiding every possible error behind a bare except.
        x = images[min(int(t * fps), last_index)]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [42]:
def get_state_value(state):
    """Return the values of the `state` dict as np.float32, ordered by sorted key."""
    return [np.float32(state[name]) for name in sorted(state)]

def frames_to_state(states):
    """Turn the flat per-episode observation history into a 32-value network input.

    `states` is a flat list in which every observation contributes 8 values.
    The result stacks four observations:
      * 1 observation  -> it is repeated four times
      * 2 observations -> each one is repeated twice
      * 3 observations -> the three of them, with the latest repeated once
      * otherwise      -> the last 32 values (the four latest observations)

    The length is expected to be a multiple of 8; other lengths are not padded
    to 32 values.
    """
    num_frames = len(states) // 8
    if num_frames == 1:
        state = states * 4
    elif num_frames == 2:
        state = states[:8] * 2 + states[8:] * 2
    elif num_frames == 3:
        state = states + states[16:]
    else:
        # Slice the argument, not the notebook-global `input_frames`: the
        # function must depend only on what it is given.
        state = states[-32:]

    return state

In [56]:
from IPython.display import Image, display

# Training schedule. Episodes whose index is a multiple of print_every_episode
# are greedy evaluation episodes: exploration is switched off, nothing is added
# to the replay buffer and no training step is run during them.
update_every_iteration = 1000  # copy online weights into the target network every this many training iterations
print_every_episode = 500  # period (in episodes) of the greedy evaluation / log line
save_video_every_episode = 5000  # period (in episodes) of video recording
NUM_EPISODE = 20000  # the loop runs episodes 0..NUM_EPISODE inclusive
NUM_EXPLORE = 20  # training only starts once episode > NUM_EXPLORE; earlier episodes just fill the buffer
BATCH_SIZE = 32  # number of experiences sampled for each training step

# number of training steps performed so far (drives the target-network sync)
iter_num = 0
for episode in range(0, NUM_EPISODE + 1):
    
    # Reset the environment
    env.reset_game()
    state = game.getGameState()  
    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]
    
    # input frame: flat history of observation values for this episode
    # NOTE(review): this list is extended on every step and never trimmed,
    # although frames_to_state only needs the most recent values.
    input_frames = get_state_value(state)
    
    # for every 500 episodes, shutdown exploration to see the performance of greedy action
    # (update_parameters at the end of the episode restores the exploring rate)
    if episode % print_every_episode == 0:
        online_agent.shutdown_explore()
    
    # cumulate reward for this episode
    cum_reward = 0
    
    # number of steps survived in this episode
    t = 0
    while not env.game_over():
        
        state = frames_to_state(input_frames)
        # batch of one state for action selection
        train_states = tf.convert_to_tensor([state], tf.float32)
        # feed current state and select an action
        action = online_agent.select_action(train_states)
        
        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])
        
        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())
        
        # record input frame
        input_frames.extend(get_state_value(game.getGameState()))
        
        # cumulate reward
        cum_reward += reward
        
        # observe the result
        state_prime = frames_to_state(input_frames)  # get next state
        
        # append experience for this episode (skipped during greedy evaluation episodes)
        if episode % print_every_episode != 0:
            buffer.add((state, action, reward, state_prime, env.game_over()))
        
        # Setting up for the next iteration
        # NOTE(review): `state` is recomputed from input_frames at the top of
        # the loop, so this assignment has no effect on the next iteration.
        state = state_prime
        t += 1
        
        # update agent: one training step per environment step
        if episode > NUM_EXPLORE and episode % print_every_episode != 0:
            iter_num += 1
            train_states, train_actions, train_rewards, train_states_prime, terminal = buffer.sample(BATCH_SIZE)
            train_states = np.asarray(train_states).reshape(-1, 32)
            train_states_prime = np.asarray(train_states_prime).reshape(-1, 32)


            # convert Python object to Tensor to prevent graph re-tracing
            train_states = tf.convert_to_tensor(train_states, tf.float32)
            train_actions = tf.convert_to_tensor(train_actions, tf.int32)
            train_rewards = tf.convert_to_tensor(train_rewards, tf.float32)
            train_states_prime = tf.convert_to_tensor(train_states_prime, tf.float32)
            terminal = tf.convert_to_tensor(terminal, tf.bool)

            train_step(train_states, train_actions, train_rewards, train_states_prime, terminal)

        # synchronize target model's weight with online model's weight every 1000 iterations
        if iter_num % update_every_iteration == 0 and episode > NUM_EXPLORE and episode % print_every_episode != 0:
            target_agent.model.set_weights(online_agent.model.get_weights())

    # update exploring rate
    # NOTE(review): only online_agent.select_action is called in this loop, so
    # the target agent's exploring rate is never used for acting here.
    online_agent.update_parameters(episode)
    target_agent.update_parameters(episode)

    # Log the greedy evaluation episode. The exploring rate printed here is the
    # value just set by update_parameters, not the 0 used while playing it.
    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print(
            "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, average loss: {}".
            format(episode, t, cum_reward, online_agent.exploring_rate, average_loss.result()))
        average_loss.reset_states()

    # every save_video_every_episode (5000) episodes, write the recorded frames as a video
    # NOTE(review): presumably the "movie_f" directory has to exist beforehand — confirm.
    if episode % save_video_every_episode == 0:
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/DQN_demo-{}.webm".format(episode), fps=60)
#         display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

Moviepy - Building video movie_f/DQN_demo-0.webm.
Moviepy - Writing video movie_f/DQN_demo-0.webm



                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-0.webm
[500] time live:62, cumulated reward: -5.0, exploring rate: 0.5, average loss: 14.165546417236328
[1000] time live:21, cumulated reward: -5.0, exploring rate: 0.5, average loss: 4.538712024688721
[1500] time live:70, cumulated reward: -4.0, exploring rate: 0.5, average loss: 5.336751937866211
[2000] time live:98, cumulated reward: -4.0, exploring rate: 0.5, average loss: 1.9761816263198853
[2500] time live:62, cumulated reward: -5.0, exploring rate: 0.43277903725889943, average loss: 1.1194506883621216
[3000] time live:62, cumulated reward: -5.0, exploring rate: 0.3660323412732292, average loss: 0.9205449819564819
[3500] time live:21, cumulated reward: -5.0, exploring rate: 0.30957986252419073, average loss: 0.9392762184143066
[4000] time live:28, cumulated reward: -5.0, exploring rate: 0.26183394327157605, average loss: 1.585087776184082
[4500] time live:107, cumulated reward: -3.0, exploring rate: 0.22145178723886091, ave

                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-5000.webm
[5500] time live:64, cumulated reward: -5.0, exploring rate: 0.15841112426184903, average loss: 0.6175907254219055
[6000] time live:45, cumulated reward: -5.0, exploring rate: 0.13397967485796172, average loss: 0.4027842581272125
[6500] time live:138, cumulated reward: -2.0, exploring rate: 0.11331624189077398, average loss: 0.2638608515262604
[7000] time live:211, cumulated reward: -1.0, exploring rate: 0.09583969128049684, average loss: 0.17209923267364502
[7500] time live:176, cumulated reward: -2.0, exploring rate: 0.08105851616218128, average loss: 0.13858462870121002
[8000] time live:4033, cumulated reward: 101.0, exploring rate: 0.0685570138491429, average loss: 0.10795757174491882
[8500] time live:1192, cumulated reward: 25.0, exploring rate: 0.05798359469728905, average loss: 0.08057218044996262
[9000] time live:1079, cumulated reward: 22.0, exploring rate: 0.04904089407128572, average loss: 0.06872694194316864


                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/DQN_demo-10000.webm
[10500] time live:905, cumulated reward: 18.0, exploring rate: 0.029670038450977102, average loss: 0.04929068684577942
[11000] time live:5841, cumulated reward: 149.0, exploring rate: 0.02509408428990297, average loss: 0.039731282740831375
[11500] time live:1377, cumulated reward: 30.0, exploring rate: 0.021223870922486707, average loss: 0.03317359834909439
[12000] time live:226, cumulated reward: 0.0, exploring rate: 0.017950553275045137, average loss: 0.027102388441562653
[12500] time live:4767, cumulated reward: 120.0, exploring rate: 0.015182073244652034, average loss: 0.02651355229318142
[13000] time live:9816, cumulated reward: 254.0, exploring rate: 0.012840570676248398, average loss: 0.028083575889468193
[13500] time live:1264, cumulated reward: 27.0, exploring rate: 0.010860193639877882, average loss: 0.0265218336135149
[14000] time live:29329, cumulated reward: 772.0, exploring rate: 0.01, average loss: 0.0218

KeyboardInterrupt: 

### 我將四個時間點的 state 當作 input，並且放兩層 dense layer，輸出跑 10000 個 episode 的結果

In [60]:
# Import only the name this cell needs: `from moviepy.editor import *` would
# dump every moviepy name into the notebook namespace and hide where names
# come from.
from moviepy.editor import VideoFileClip

# Replay the video written by the training loop for episode 10000 inline.
# `display` comes from the IPython.display import in the training cell.
clip = VideoFileClip("movie_f/DQN_demo-10000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4
