In [1]:
import tensorflow as tf
import numpy as np

In [2]:
!nvidia-smi

Thu Dec 31 16:27:11 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 450.57       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  Off  | 00000000:18:00.0 Off |                  N/A |
| 31%   46C    P2    61W / 250W |   6167MiB / 11016MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
import tensorflow as tf
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
    except RuntimeError as e:
        print(e)

In [4]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE

game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [5]:
# Define Input Size
IMG_WIDTH = 84
IMG_HEIGHT = 84
NUM_STACK = 4
# For Epsilon-greedy
MIN_EXPLORING_RATE = 0.01

# Change the network structure from CNNs to Dense layers.

In [6]:
class Agent:
    def __init__(self, name, num_action, discount_factor=0.99):
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.model = self.build_model(name)

    def build_model(self, name):
        # input: state
        # output: each action's Q-value 
        #screen_stack = tf.keras.Input(shape=[IMG_WIDTH, IMG_HEIGHT, NUM_STACK], dtype=tf.float32)
        state = tf.keras.Input(shape=[32], dtype=tf.float32)
        x = tf.keras.layers.Dense(units=32, activation=tf.nn.relu)(state)
        x = tf.keras.layers.Dense(units=64, activation=tf.nn.relu)(x)
        x = tf.keras.layers.Dense(units=128, activation=tf.nn.relu)(x)
        Q = tf.keras.layers.Dense(self.num_action)(x)

        model = tf.keras.Model(name=name, inputs=state, outputs=Q)

        return model
    
    def loss(self, state, action, reward, tar_Q, ternimal):
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)
        index = tf.stack([tf.range(tf.shape(action)[0]), action], axis=1)
        # Q(s,a,theta) for selected a, shape (batch_size, 1)
        Q = tf.gather_nd(output, index)
        
        # set tar_Q as 0 if reaching terminal state
        tar_Q *= ~np.array(terminal)

        # loss = E[r+max(Q(s',a',theta'))-Q(s,a,theta)]
        loss = tf.reduce_mean(tf.square(reward + self.discount_factor * tar_Q - Q))

        return loss
    
    def max_Q(self, state):
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)

        # max(Q(s',a',theta')), shape (batch_size, 1)
        return tf.reduce_max(output, axis=1)
    
    def select_action(self, state):
        # epsilon-greedy
        if np.random.rand() < self.exploring_rate:
            action = np.random.choice(self.num_action)  # Select a random action
        else:
            #state = np.expand_dims(state, axis = 0)
            # Q(s,a,theta) for all a, shape (batch_size, num_action)
            output = self.model(state)

            # select action with highest action-value
            action = tf.argmax(output, axis=1)[0]

        return action
    
    def update_parameters(self, episode):
        self.exploring_rate = max(MIN_EXPLORING_RATE, min(0.5, 0.99**((episode) / 30)))

    def shutdown_explore(self):
        # make action selection greedy
        self.exploring_rate = 0

In [7]:
# init agent
num_action = len(env.getActionSet())

# agent for frequently updating
online_agent = Agent('online', num_action)

# agent for slow updating
target_agent = Agent('target', num_action)
# synchronize target model's weight with online model's weight
target_agent.model.set_weights(online_agent.model.get_weights())

In [8]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
average_loss = tf.keras.metrics.Mean(name='loss')

@tf.function
def train_step(state, action, reward, next_state, ternimal):
    # Delayed Target Network
    tar_Q = target_agent.max_Q(next_state)
    with tf.GradientTape() as tape:
        loss = online_agent.loss(state, action, reward, tar_Q, ternimal)
    gradients = tape.gradient(loss, online_agent.model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, online_agent.model.trainable_variables))
    
    average_loss.update_state(loss)

In [9]:
class Replay_buffer():
    def __init__(self, buffer_size=50000):
        self.experiences = []
        self.buffer_size = buffer_size

    def add(self, experience):
        if len(self.experiences) >= self.buffer_size:
            self.experiences.pop(0)
        self.experiences.append(experience)

    def sample(self, size):
        """
        sample experience from buffer
        """
        if size > len(self.experiences):
            experiences_idx = np.random.choice(len(self.experiences), size=size)
        else:
            experiences_idx = np.random.choice(len(self.experiences), size=size, replace=False)

        # from all sampled experiences, extract a tuple of (s,a,r,s')
        states = []
        actions = []
        rewards = []
        states_prime = []
        terminal = []
        for i in range(size):
            states.append(self.experiences[experiences_idx[i]][0])
            actions.append(self.experiences[experiences_idx[i]][1])
            rewards.append(self.experiences[experiences_idx[i]][2])
            states_prime.append(self.experiences[experiences_idx[i]][3])
            terminal.append(self.experiences[experiences_idx[i]][4])

        return states, actions, rewards, states_prime, terminal

In [10]:
# init buffer
buffer = Replay_buffer()

In [11]:
import moviepy.editor as mpy

def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [12]:
import skimage.transform

def preprocess_screen(state):
    value = []
    state_key = [k for k, v in sorted(state.items())]
    for key in state_key:
            value.append(np.float32(state[key]))
    return value

def frames_to_state(states):
    if(len(states)//8 == 1):
        state = (states*4)
    elif(len(states)//8 == 2):
        state = (states[:8]*2 + states[8:]*2)
    elif(len(states)//8 == 3):
        state = (states + states[16:])
    else:
        state = (input_frames[-32:])

    return state

# Train the state-based DQN agent to play Flappy Bird.

## Change the input from stack of frames to game state

In [13]:
from IPython.display import Image, display
from keras import backend as K #转换为张量
import time

t1 = time.time()


update_every_iteration = 1000
print_every_episode = 500
save_video_every_episode = 5000
NUM_EPISODE = 20000
NUM_EXPLORE = 20
BATCH_SIZE = 32

iter_num = 0
for episode in range(0, NUM_EPISODE + 1):
    
    # Reset the environment
    env.reset_game()
    
    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]
    
    # input frame
    #input_frames = [preprocess_screen(env.getScreenGrayscale())]
    state = game.getGameState()
    input_frames = preprocess_screen(state)
    # for every 500 episodes, shutdown exploration to see the performance of greedy action
    if episode % print_every_episode == 0:
        online_agent.shutdown_explore()
    
    # cumulate reward for this episode
    cum_reward = 0
    
    t = 0
    while not env.game_over():
        
        #state = frames_to_state(input_frames)
        state = frames_to_state(input_frames)

        # feed current state and select an action
        action = online_agent.select_action(tf.convert_to_tensor([state], tf.float32))
        
        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])
        
        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())
        
        # record input frame
        input_frames.extend(preprocess_screen(game.getGameState()))
        
        # cumulate reward
        cum_reward += reward
        
        # observe the result
        #state_prime = frames_to_state(input_frames)  # get next state
        state_prime = frames_to_state(input_frames)
        
        # append experience for this episode
        if episode % print_every_episode != 0:
            buffer.add((state, action, reward, state_prime, env.game_over()))
        
        # Setting up for the next iteration
        state = state_prime
        t += 1
        
        # update agent
        if episode > NUM_EXPLORE and episode % print_every_episode != 0:
            iter_num += 1
            train_states, train_actions, train_rewards, train_states_prime, terminal = buffer.sample(BATCH_SIZE)
            train_states = np.asarray(train_states).reshape(-1, 32)
            train_states_prime = np.asarray(train_states_prime).reshape(-1, 32)
            
            # convert Python object to Tensor to prevent graph re-tracing
            train_states = tf.convert_to_tensor(train_states, tf.float32)
            train_actions = tf.convert_to_tensor(train_actions, tf.int32)
            train_rewards = tf.convert_to_tensor(train_rewards, tf.float32)
            train_states_prime = tf.convert_to_tensor(train_states_prime, tf.float32)
            terminal = tf.convert_to_tensor(terminal, tf.bool)
            
            train_step(train_states, train_actions, train_rewards, train_states_prime, terminal)

        # synchronize target model's weight with online model's weight every 1000 iterations
        if iter_num % update_every_iteration == 0 and episode > NUM_EXPLORE and episode % print_every_episode != 0:
            target_agent.model.set_weights(online_agent.model.get_weights())

    # update exploring rate
    online_agent.update_parameters(episode)
    target_agent.update_parameters(episode)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print(
            "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, average loss: {}".
            format(episode, t, cum_reward, online_agent.exploring_rate, average_loss.result()))
        average_loss.reset_states()

    if episode % save_video_every_episode == 0:  # for every 500 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f_Dominic/DQN_demo-{}.webm".format(episode), fps=60)
#         display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))


t2 = time.time()

print('time elapsed: ' + str(t2-t1) + ' seconds')


t:  24%|██▍       | 15/63 [00:00<00:00, 147.22it/s, now=None]

Moviepy - Building video movie_f_Dominic/DQN_demo-0.webm.
Moviepy - Writing video movie_f_Dominic/DQN_demo-0.webm



                                                             

Moviepy - Done !
Moviepy - video ready movie_f_Dominic/DQN_demo-0.webm
[500] time live:62, cumulated reward: -5.0, exploring rate: 0.5, average loss: 2.4755661487579346
[1000] time live:33, cumulated reward: -5.0, exploring rate: 0.5, average loss: 3.022477626800537
[1500] time live:62, cumulated reward: -5.0, exploring rate: 0.5, average loss: 3.8035991191864014
[2000] time live:62, cumulated reward: -5.0, exploring rate: 0.5, average loss: 2.7187771797180176
[2500] time live:62, cumulated reward: -5.0, exploring rate: 0.43277903725889943, average loss: 1.9075249433517456
[3000] time live:62, cumulated reward: -5.0, exploring rate: 0.3660323412732292, average loss: 1.6191405057907104
[3500] time live:102, cumulated reward: -3.0, exploring rate: 0.30957986252419073, average loss: 1.0628271102905273
[4000] time live:364, cumulated reward: 4.0, exploring rate: 0.26183394327157605, average loss: 0.8382558822631836
[4500] time live:247, cumulated reward: 0.0, exploring rate: 0.221451787238

t:   3%|▎         | 15/587 [00:00<00:03, 146.30it/s, now=None]

[5000] time live:586, cumulated reward: 9.0, exploring rate: 0.18729769509073985, average loss: 0.5243544578552246
Moviepy - Building video movie_f_Dominic/DQN_demo-5000.webm.
Moviepy - Writing video movie_f_Dominic/DQN_demo-5000.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f_Dominic/DQN_demo-5000.webm
[5500] time live:62, cumulated reward: -5.0, exploring rate: 0.15841112426184903, average loss: 0.49465230107307434
[6000] time live:1049, cumulated reward: 22.0, exploring rate: 0.13397967485796172, average loss: 0.44029441475868225
[6500] time live:62, cumulated reward: -5.0, exploring rate: 0.11331624189077398, average loss: 0.4174327254295349
[7000] time live:62, cumulated reward: -5.0, exploring rate: 0.09583969128049684, average loss: 0.3912205994129181
[7500] time live:62, cumulated reward: -5.0, exploring rate: 0.08105851616218128, average loss: 0.3763944208621979
[8000] time live:149, cumulated reward: -2.0, exploring rate: 0.0685570138491429, average loss: 0.34471395611763
[8500] time live:148, cumulated reward: -2.0, exploring rate: 0.05798359469728905, average loss: 0.31818342208862305
[9000] time live:3553, cumulated reward: 88.0, exploring rate: 0.04904089407128572, average loss: 0.3219783902168274


t:   2%|▏         | 14/587 [00:00<00:04, 136.14it/s, now=None]

[10000] time live:586, cumulated reward: 9.0, exploring rate: 0.03508042658630376, average loss: 0.2951175272464752
Moviepy - Building video movie_f_Dominic/DQN_demo-10000.webm.
Moviepy - Writing video movie_f_Dominic/DQN_demo-10000.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f_Dominic/DQN_demo-10000.webm
[10500] time live:324, cumulated reward: 2.0, exploring rate: 0.029670038450977102, average loss: 0.29131877422332764
[11000] time live:211, cumulated reward: -1.0, exploring rate: 0.02509408428990297, average loss: 0.2837284505367279
[11500] time live:324, cumulated reward: 2.0, exploring rate: 0.021223870922486707, average loss: 0.27060526609420776
[12000] time live:1264, cumulated reward: 27.0, exploring rate: 0.017950553275045137, average loss: 0.2118356078863144
[12500] time live:62, cumulated reward: -5.0, exploring rate: 0.015182073244652034, average loss: 0.17265497148036957
[13000] time live:149, cumulated reward: -2.0, exploring rate: 0.012840570676248398, average loss: 0.2012222856283188
[13500] time live:62, cumulated reward: -5.0, exploring rate: 0.010860193639877882, average loss: 0.1864195317029953


KeyboardInterrupt: 

#  第10000 EPISODE 的結果

## The bird is able to fly through at least 1 pipe

In [14]:
from moviepy.editor import *
clip = VideoFileClip("movie_f_Dominic/DQN_demo-10000.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

t:  16%|█▌        | 92/587 [00:00<00:00, 912.63it/s, now=None]

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4
