In [6]:
import gymnasium as gym
import tensorflow as tf
from tensorflow.keras.regularizers import l2
import numpy as np
import random
import pickle

#tf.random.set_seed(7)
#np.random.seed(42)
#random.seed(1337)
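# NOTE: seeding tf / numpy / random alone did not make runs reproducible.
# Gymnasium keeps its own RNGs: the environment is seeded through reset(seed=...),
# and every action space has a separate RNG that is seeded with action_space.seed(...).
# TODO: also seed the action space used for epsilon-greedy sampling,
#       e.g. envs.single_action_space.seed(RANDOM_SEED).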


N = 10 #outer loop
K = 3 #inner loop
MINI_BATCH_SIZE = 20
CONVERGED = False
MAX_STEPS = 4000
GAMMA = 0.99
regularization_factor = 0.001
epsilon = 1.0
minimum_epsilon = 0.01
epsilon_decay = 0.9994
NUM_ENVS = 3
BUFFER_SIZE = 100000


use_prefill = True
use_DDQN = True

In [7]:
# Define the model
def create_model(regularization_factor):
    #inputs = tf.keras.Input(shape=(4,))
    inputs = tf.keras.Input(shape=(8,)) #changed
    x = tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor))(inputs)
    x = tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor))(x)
    x = tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor))(x)
    #outputs = tf.keras.layers.Dense(2, activation="relu")(x)
    outputs = tf.keras.layers.Dense(4, activation="relu")(x) #changed
    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="LunarLander")
    return model

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)



In [8]:
class Environment:
    def __init__(self, NUM_ENVS):
        self.num_envs = NUM_ENVS
        #self.envs = envs = gym.vector.make('CartPole-v1', num_envs=NUM_ENVS) #changed
        self.envs = envs = gym.vector.make('LunarLander-v2', num_envs=NUM_ENVS)
        self.current_state, _ = self.envs.reset(seed=11)

    def sample(self, model, epsilon):
        q_values = model(self.current_state) #get q values for current state
        action = np.argmax(q_values, axis=1) #get action that maximizes q-value
        action = [self.envs.single_action_space.sample() if random.random() < epsilon else a for a in action] #choose epsilon greedy
        new_observation, reward, terminated, _, _ = self.envs.step(action)
        old_observation = self.current_state

        self.current_state = new_observation #update current state after environment did step
        return (old_observation, action, reward, new_observation, terminated)


In [9]:
class Buffer:
    def __init__(self, max_size, path=None):
        self.buffer = []
        self.max_size = max_size
        if path:
            self.load_from_file(path)

    def add_to_buffer(self, samples):
        #unpack the different environments
        old_obs_ = samples[0]
        actions_ = samples[1]
        rewards_ = samples[2]
        new_obs_ = samples[3]
        terminateds_ = samples[4]
        for o, a, r, no, t in zip(old_obs_, actions_, rewards_, new_obs_, terminateds_):
            self.buffer.append((o, a, r, no, t))
        if len(self.buffer) > self.max_size:
            self.buffer = self.buffer[-self.max_size:]

    def sample_minibatch(self, batch_size):
        r = random.sample(self.buffer, min(len(self.buffer), batch_size))
        return r
    
    def load_from_file(self, path):
        with open(path, 'rb') as f:
            lst = pickle.load(f)
        self.buffer += lst
        if len(self.buffer) > self.max_size:
            self.buffer = self.buffer[-self.max_size:]
        

In [10]:
#init environments
envs = Environment(NUM_ENVS)

#just some approximatory metrics
returns = np.zeros(NUM_ENVS)
return_tracker = []

# Create the model
Q_theta = create_model(regularization_factor)
Q_target = tf.keras.models.clone_model(Q_theta)

# init buffer
kabuff = Buffer(BUFFER_SIZE)
# prefill buffer
if use_prefill:
    kabuff.load_from_file("kabuff.pkl") #kabuff.pkl contains samples derived from a previous working policy

STEPS = 0

while not CONVERGED and STEPS < MAX_STEPS:
    STEPS += 1
    
    Q_target.set_weights(Q_theta.get_weights()) #update target network to current training network interation
    epsilon = max(minimum_epsilon, epsilon*epsilon_decay) #anneal the epsilon used for sampling
    
    for n in range(N):
        
        new_samples = envs.sample(Q_theta, epsilon) #sample from the environment
        
        #update return-metrics, reset returns if terminated
        #envs.sample returns a tuple of NUM_ENVS long np.arrays corresponding to old_obs, actions, rewards, new_obs, terminated
        #we add the rewards to our return counters
        returns += new_samples[2] #new_samples[2] = rewards
        for i, t in enumerate(new_samples[4]): #new_samples[4] = terminateds
            if t:
                #environment terminated, add to tracker and reset
                return_tracker.append(returns[i])
                returns[i] = 0

        kabuff.add_to_buffer(new_samples) #add environment samples to the buffer

        for k in range(K):
            #sample s,a,r,s' minibatch from buffer
            minibatch = kabuff.sample_minibatch(MINI_BATCH_SIZE)

            #unpack the minibatch
            new_states = np.array([sample[3] for sample in minibatch])
            rewards = np.array([sample[2] for sample in minibatch])
            actions = np.array([sample[1] for sample in minibatch])
            old_states = np.array([sample[0] for sample in minibatch])
            terminateds = np.array([sample[4] for sample in minibatch])


            #DDQN
            if use_DDQN:
                Q_target_values = Q_target(new_states)
                Q_theta_values = Q_theta(new_states)
                max_Q_target_values = []
                for target_action_vals, theta_action_vals in zip(Q_target_values, Q_theta_values):
                    Q_theta_max_index = np.argmax(theta_action_vals)
                    max_Q_target_value = target_action_vals[Q_theta_max_index]
                    max_Q_target_values.append(max_Q_target_value)
                max_Q_target_values = np.array(max_Q_target_values)
            else:
                Q_target_values = Q_target(new_states)
                max_Q_target_values = np.array([max(action_values) for action_values in Q_target_values])


            #calculate the targets, don't add new Q_target_values if an environment just terminated
            target_q_values = rewards + GAMMA * max_Q_target_values * (1-terminateds)

            #training: do gradient descent of Q_theta in direction of target
            with tf.GradientTape() as tape:
                predictions = Q_theta(old_states)
                selected_q_values = tf.gather(predictions, actions, batch_dims=1) #predictions of Q(old_states, actions)
                loss_value = tf.reduce_mean(tf.square(selected_q_values-target_q_values)) #compute simple MSE loss
            gradients = tape.gradient(loss_value, Q_theta.trainable_variables) #get
            optimizer.apply_gradients(zip(gradients, Q_theta.trainable_variables)) #and apply gradients

    #every 10 steps, print log message and reset returns
    if STEPS % 10 == 0:
        if(return_tracker):
            print("average return since last print: " + str(np.mean(return_tracker)) + " in step " + str(STEPS))
            return_tracker = []
        else:
            print("no new returns in step " + str(STEPS))

average return since last print: -113.01152249009446 in step 10
average return since last print: -191.8076237943239 in step 20
average return since last print: -147.18086310215585 in step 30
average return since last print: -69.145859312376 in step 40
average return since last print: -149.24887289531358 in step 50
average return since last print: -161.71989557964716 in step 60
average return since last print: -122.84228365862855 in step 70
average return since last print: -123.73784293075593 in step 80
average return since last print: -184.92750585255058 in step 90
average return since last print: -106.50915910947009 in step 100
average return since last print: -115.1366378890804 in step 110
average return since last print: -215.10463943725333 in step 120
average return since last print: -244.99161285254715 in step 130
average return since last print: -179.30571684388218 in step 140
average return since last print: -247.72557568663305 in step 150
average return since last print: -222.0

In [11]:
Q_theta.save('final_model')

INFO:tensorflow:Assets written to: seventh_try/assets


In [4]:
#Q_theta = tf.keras.models.load_model("fourth_try")



2023-06-15 09:33:42.560151: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [12]:
test_env = gym.make('LunarLander-v2', render_mode='human')
#test_env = gym.make('CartPole-v1', render_mode='human')
obs, inf = test_env.reset()


In [19]:
for i in range(1000):
    qs = Q_theta(tf.expand_dims(obs, 0))
    act = np.argmax(qs)
    obs, _, terminated, _, _ = test_env.step(act)
    if(terminated):
        obs, _ = test_env.reset()

: 