In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import tqdm as tqdm
import tensorflow_probability as tfp
import random as rand
from PIL import Image, ImageDraw
import numpy as np
import datetime
from blurr import to_blurred
import gym
import threading, queue
from buffer import Buffer
from dqn import DQN
from sample_trajectory import create_trajectory_thread,create_trajectory

In [2]:
# --- Training configuration -------------------------------------------------
# Replay buffer: both values are handed to Buffer(...) when it is constructed.
# NOTE(review): presumably capacity and the minimum fill level before training
# starts — confirm against buffer.py.
BUFFER_SIZE = 2_000
MIN_ELEMENTS_IN_BUFFER = 1_000

# Passed to buffer.fill(...) for the initial fill of the replay buffer.
THREADS = 10

# Number of minibatch optimisation steps performed per outer training round.
INNER_ITS = 50

# Amount subtracted from epsilon after every outer training round.
EPSILON_DECAY = 1e-3

In [3]:
# --- Networks and replay buffer ----------------------------------------------
# Online network plus a second instance that is used as the target network.
# NOTE(review): 9 is presumably the number of discrete actions — confirm in dqn.py.
model = DQN(9)
model_target = DQN(9)
buffer = Buffer(BUFFER_SIZE, MIN_ELEMENTS_IN_BUFFER)

# initialize weights: one dummy forward pass per network so the weight tensors
# exist before they are copied below.
# NOTE(review): (1, 84, 84, 4) presumably is one stack of four 84x84 frames — confirm.
model(tf.random.uniform(shape=(1,84,84,4)))
model_target(tf.random.uniform(shape=(1,84,84,4)))

# Start the target network as an exact copy of the online network.
# get_weights() already returns the list of arrays set_weights() expects.
# Wrapping that list in np.array(..., dtype=object) is unnecessary and can raise
# "ValueError: could not broadcast input array" when weight arrays share a
# leading dimension but differ in rank, so the list is passed through directly.
model_target.set_weights(model.get_weights())

# initialize buffer
# NOTE(review): the trailing 1 is presumably the exploration epsilon used while
# filling (fully random actions) — confirm in buffer.py / sample_trajectory.py.
buffer.fill(THREADS,create_trajectory_thread,model,1)

optimizer = tf.keras.optimizers.Adam(0.00025, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

# Exploration rate handed to create_trajectory; decayed inside the training loop.
epsilon = 1

# TensorBoard logging, see https://www.tensorflow.org/tensorboard/get_started
# Each run gets its own timestamped directory with one sub-directory per metric.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_root = 'logs/asterix_test/run1/' + current_time
dqn_log_dir = log_root + '/dqn'
reward_log_dir = log_root + '/reward'
dqn_summary_writer = tf.summary.create_file_writer(dqn_log_dir)
reward_summary_writer = tf.summary.create_file_writer(reward_log_dir)


# --- Training loop -------------------------------------------------------------
# Loop-level settings, named here so the loop body contains no magic numbers.
NUM_ROUNDS = 12000   # number of outer rounds (one new trajectory per round)
POLYAK_TAU = 0.1     # share of the online weights blended into the target per round
EPSILON_MIN = 0.1    # floor for the exploration rate
BATCH_SIZE = 512     # minibatch size requested from the replay buffer

for i in range(NUM_ROUNDS):

    # apply polyak averaging: target <- (1 - tau) * target + tau * online.
    # Done tensor by tensor; stacking the weight lists into a dtype=object
    # ndarray is fragile (NumPy raises ValueError for some shape combinations).
    model_target.set_weights([
        (1-POLYAK_TAU)*target_w + POLYAK_TAU*online_w
        for target_w, online_w in zip(model_target.get_weights(), model.get_weights())
    ])

    # sample new trajectory with the current exploration rate
    new_data = create_trajectory(model,False,epsilon)

    # Linear epsilon decay, clamped so float drift cannot push it below the floor.
    if epsilon > EPSILON_MIN:
        epsilon = max(EPSILON_MIN, epsilon - EPSILON_DECAY)

    # Mean per-transition reward of the new trajectory (computed once, used twice).
    reward = [tf.cast(r,tf.float32) for _s,_a,r,_new_s,_done in new_data]
    mean_reward = tf.reduce_mean(reward)
    print("round: ", i," average reward: ",mean_reward)

    # log average reward in tensorboard; the step is aligned with the loss curve,
    # which advances INNER_ITS steps per round
    with reward_summary_writer.as_default():
        tf.summary.scalar("reward", mean_reward, step = i*INNER_ITS)

    # add new data to replay buffer
    buffer.extend(new_data)

    # INNER_ITS optimisation steps on minibatches drawn from the replay buffer
    for j in range(INNER_ITS):
        s,a,r,s_new,done  = buffer.sample_minibatch(BATCH_SIZE)
        loss = model.step(s,a,r,s_new,done, optimizer, model_target)

        # log loss in tensorboard
        with dqn_summary_writer.as_default():
            tf.summary.scalar("dqn", loss, step=j+i*INNER_ITS)

    # Checkpoint both networks after every round.
    model.save_weights("./asterix_test/run1/model")
    model_target.save_weights("./asterix_test/run1/model_target")

Filling buffer:  0
Filling buffer:  54
Filling buffer:  108
Filling buffer:  175
Filling buffer:  249
Filling buffer:  320
Filling buffer:  400
Filling buffer:  480
Filling buffer:  563
Filling buffer:  616
Filling buffer:  670
Filling buffer:  724
Filling buffer:  779
Filling buffer:  917
Filling buffer:  981
round:  0  average reward:  tf.Tensor(3.164557, shape=(), dtype=float32)
round:  1  average reward:  tf.Tensor(3.271028, shape=(), dtype=float32)
round:  2  average reward:  tf.Tensor(1.1235955, shape=(), dtype=float32)


KeyboardInterrupt: 