In [1]:
import gymnasium as gym
import tensorflow as tf
from tensorflow.keras.regularizers import l2
import numpy as np
import random

# Define the CNN model
def create_model(regularization_factor):
    #inputs = tf.keras.Input(shape=(4,))
    inputs = tf.keras.Input(shape=(8,)) #changed
    x = tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor))(inputs)
    x = tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor))(x)
    x = tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor))(x)
    #outputs = tf.keras.layers.Dense(2, activation="relu")(x)
    outputs = tf.keras.layers.Dense(4, activation="relu")(x) #changed
    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="LunarLander")
    return model

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)



2023-06-14 18:07:30.637080: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-14 18:07:31.811717: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [2]:
class Environment:
    def __init__(self, NUM_ENVS):
        self.num_envs = NUM_ENVS
        #self.envs = envs = gym.vector.make('CartPole-v1', num_envs=NUM_ENVS)#, render_mode='human') #changed
        self.envs = envs = gym.vector.make('LunarLander-v2', num_envs=NUM_ENVS)
        self.current_state, _ = self.envs.reset()

    def sample(self, model, epsioln):
        q_values = model(self.current_state) #get q values for current state
        action = np.argmax(q_values, axis=1) #get action that maximizes q-value #potential error?
        action = [random.choice(range(4)) if random.random() < epsilon else a for a in action] #choose epsilon greedy #changed
        new_observation, reward, terminated, truncated, info = self.envs.step(action)
        old_observation = self.current_state

        self.current_state = new_observation
        return (old_observation, action, reward, new_observation, terminated)


In [3]:
class Buffer:
    def __init__(self, max_size):
        self.buffer = []
        self.max_size = max_size

    def add_to_buffer(self, samples):
        #unpack the different environments
        old_obs_ = samples[0]
        actions_ = samples[1]
        rewards_ = samples[2]
        new_obs_ = samples[3]
        terminateds_ = samples[4]
        for o, a, r, no, t in zip(old_obs_, actions_, rewards_, new_obs_, terminateds_):
            self.buffer.append((o, a, r, no, t))
        if len(self.buffer) > self.max_size:
            self.buffer = self.buffer[-self.max_size:]

    def sample_minibatch(self, batch_size):
        r = random.sample(self.buffer, min(len(self.buffer), batch_size))
        return r
        

In [4]:
TAU = 0.01
N = 50
K = 3
MINI_BATCH_SIZE = 64
CONVERGED = False
#MAX_STEPS = 2000#0
MAX_STEPS = 300
GAMMA = 0.99
regularization_factor = 0.001
epsilon = 1.0
minimum_epsilon = 0.01
epsilon_decay = 0.99


In [5]:

envs = Environment(7)

#just some approximatory metrics
returns = np.zeros(7)
return_tracker = []

# Create the model
Q_theta = create_model(regularization_factor)
Q_target = tf.keras.models.clone_model(Q_theta)

kabuff = Buffer(100000)

STEPS = 0

while not CONVERGED and STEPS < MAX_STEPS:
    STEPS += 1
    
    Q_target.set_weights(Q_theta.get_weights())
    
    epsilon = max(minimum_epsilon, epsilon*epsilon_decay) 
    for n in range(N):
        
        new_samples = envs.sample(Q_theta, epsilon) #sample from the environment
        
        #update return-metrics, reset returns if terminated
        #envs.sample returns a tuple of NUM_ENVS long np.arrays corresponding to old_obs, actions, rewards, new_obs, terminated
        #we add the rewards to our return counters
        returns += new_samples[2] #new_samples[2] = rewards
        for i, t in enumerate(new_samples[4]): #new_samples[4] = terminateds
            if t:
                #environment terminated, add to tracker and reset
                return_tracker.append(returns[i])
                returns[i] = 0

        kabuff.add_to_buffer(new_samples) #add environment samples to the buffer

        for k in range(K):
            #sample s,a,r,s' minibatch from buffer
            minibatch = kabuff.sample_minibatch(MINI_BATCH_SIZE)

            new_states = [sample[3] for sample in minibatch]
            rewards = [sample[2] for sample in minibatch]
            actions = [sample[1] for sample in minibatch]
            old_states = [sample[0] for sample in minibatch]
            terminateds = [sample[4] for sample in minibatch]

            new_states = np.array(new_states)
            rewards = np.array(rewards)
            actions = np.array(actions)
            old_states = np.array(old_states)
            terminateds = np.array(terminateds)
            #everything seems fine

            Q_target_values = Q_target(new_states)
            #print("new states:", new_states)
            #print("Q_target_values", Q_target_values)
            #print("----------------------------------------------------------------------")
            #weirdly learns many zeros
            max_Q_target_values = np.array([max(action_values) for action_values in Q_target_values])
            target_q_values = rewards + GAMMA * max_Q_target_values #this calculation works

            for i, tqv in enumerate(target_q_values):
                if terminateds[i]:
                    target_q_values[i] = rewards[i]

            with tf.GradientTape() as tape:
                predictions = Q_theta(old_states)
                #print(predictions)
                selected_q_values = tf.gather(predictions, actions, batch_dims=1)
                loss_value = tf.reduce_mean(tf.square(selected_q_values-target_q_values))
            gradients = tape.gradient(loss_value, Q_theta.trainable_variables)
            optimizer.apply_gradients(zip(gradients, Q_theta.trainable_variables))
    
    if STEPS % 10 == 0:
        if(return_tracker):
            print("average return since last print: " + str(np.mean(return_tracker)) + " in step " + str(STEPS))
            return_tracker = []
        else:
            print("no new returns in step " + str(STEPS))

average return since last print: -203.92052794315236 in step 10
average return since last print: -167.68321464034344 in step 20
average return since last print: -171.57197403436402 in step 30
average return since last print: -129.9988406321066 in step 40
average return since last print: -142.28509053195347 in step 50
average return since last print: -159.69538859004027 in step 60
average return since last print: -151.00913559181242 in step 70
average return since last print: -130.30208402133323 in step 80
average return since last print: -132.02023475880023 in step 90
average return since last print: -127.34682996363509 in step 100
average return since last print: -137.2870202731709 in step 110
average return since last print: -146.97277641179255 in step 120
average return since last print: -148.20637640478202 in step 130
average return since last print: -137.38321912007072 in step 140
average return since last print: -142.78944835684715 in step 150
average return since last print: -15

In [6]:
Q_theta.save('fith_try')

INFO:tensorflow:Assets written to: fith_try/assets


In [7]:
Q_theta = tf.keras.models.load_model("fourth_try")

OSError: No file or directory found at fourth_try

In [7]:
test_env = gym.make('LunarLander-v2', render_mode='human')
#test_env = gym.make('CartPole-v1', render_mode='human')
obs, inf = test_env.reset()


In [9]:
for i in range(1000):
    qs = Q_theta(tf.expand_dims(obs, 0))
    act = np.argmax(qs)
    obs, _, terminated, _, _ = test_env.step(act)
    if(terminated):
        obs, _ = test_env.reset()

: 