In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
import gymnasium as gym
import tensorflow as tf
import numpy as np
import random


In [3]:
class Buffer:
    def __init__(self, max_size, NUM_ENVS, observation_preprocessing_function, EPSILON = 0.2,):
        self.buffer = []
        self.max_size = max_size
        self.num_envs = NUM_ENVS
        self.envs = envs = gym.vector.make('LunarLander-v2', num_envs=NUM_ENVS)#, render_mode='human')
        self.current_state, _ = self.envs.reset()
        self.epsilon = 0.2
        self.observation_preprocessing_function = observation_preprocessing_function
        self.unroll_steps = 5

        self.data = []

    def fill_with_samples(self, model):
        states_list = []
        actions_list = []
        rewards_list = []
        subsequent_states_list = []
        termintateds_list = []


        for i in range(self.unroll_steps):
            observations = self.observation_preprocessing_function(self.current_state) # ???
            q_values = model(observations) #get q values for current state
            action = np.argmax(q_values, axis=1) #get action that maximizes q-value #potential error?
            action = [random.choice(range(4)) if random.random() < self.epsilon else a for a in action] #choose epsilon greedy
            new_observation, reward, terminated, truncated, info = self.envs.step(action)

            states_list.append(self.current_state)
            actions_list.append(action)
            rewards_list.append(reward)
            subsequent_states_list.append(new_observation)
            termintateds_list.append(terminated)

            self.current_state = new_observation


        def data_generator():
            for states_batch, actions_batch, rewards_batch, subsequent_states_batch, terminateds_batch in zip(states_list, actions_list, rewards_list, subsequent_states_list, termintateds_list):
                for game in range(self.num_envs):
                    state = states_batch[game, :]
                    action = actions_batch[game]
                    reward = rewards_batch[game]
                    subsequent_state = subsequent_states_batch[game, :]
                    terminated = terminateds_batch[game]
                    yield (state, action, reward, subsequent_state, terminated)


        sig = (tf.TensorSpec(shape=(8), dtype = tf.float32), tf.TensorSpec(shape=(), dtype = tf.int32), tf.TensorSpec(shape=(), dtype = tf.float32), tf.TensorSpec(shape=(8), dtype = tf.float32), tf.TensorSpec(shape=(), dtype = tf.bool))
        new_samples_dataset = tf.data.Dataset.from_generator(data_generator, output_signature = sig)
        #print(new_samples_dataset)
        #r = lambda _state, _action, _reward, _nextstate, _terminated: self.observation_preprocessing_function(_state), _action, _reward, self.observation_preprocessing_function(_nextstate), _terminated
        #new_samples_dataset = new_samples_dataset.map(r)
        #new_samples_dataset = new_samples_dataset.prefetch(buffer_size = self.unroll_steps*self.num_envs).cache().shuffle(buffer_size = self.unroll_steps*self.num_envs, reshuffle_each_iteration=True)
        new_samples_dataset = new_samples_dataset.cache().shuffle(buffer_size = self.unroll_steps*self.num_envs, reshuffle_each_iteration=True)


        for elem in new_samples_dataset:
            continue

        self.data.append(new_samples_dataset)

        datapoints_in_data = len(self.data)*self.unroll_steps*self.num_envs
        if datapoints_in_data > self.max_size:
            self.data.pop(0)

    def create_dataset(self):
        erp_dataset = tf.data.Dataset.sample_from_datasets(self.data, stop_on_empty_dataset=False)
        #if erp_dataset.cardinality().numpy() < 2:
            # Add another sample from the datasets
        #    additional_sample = tf.data.Dataset.sample_from_datasets(self.data, stop_on_empty_dataset=False)
        #    erp_dataset = erp_dataset.concatenate(additional_sample)

        return erp_dataset
        

In [4]:
def observation_preprocessing_function(observation):
    return observation

In [5]:
def create_model():
    inputs = tf.keras.Input(shape=(8,))
    x = tf.keras.layers.Dense(100, activation="relu")(inputs)
    x = tf.keras.layers.Dense(50, activation="relu")(x)
    x = tf.keras.layers.Dense(32, activation="relu")(x)
    outputs = tf.keras.layers.Dense(4, activation="relu")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="LunarLander")
    return model

In [6]:
def train_dqn(train_dqn_network, target_dqn, dataset_, optimizer, num_training_steps, gamma):
    def training_step(q_target, observations, actions):
        with tf.GradientTape() as tape:
            predictions = train_dqn_network(observations)
            #print(predictions)
            selected_q_values = tf.gather(predictions, actions, batch_dims=1)
            loss_value = tf.reduce_mean(tf.square(q_target - selected_q_values))
        gradients = tape.gradient(loss_value, train_dqn_network.trainable_variables)
        optimizer.apply_gradients(zip(gradients, train_dqn_network.trainable_variables))
        return loss_value.numpy()
    

    losses = []
    q_values = []
    #print("dataset in train function:", dataset_)
    for i, state_transition in enumerate(dataset_):
        state, action, reward, subsequent_state, terminated = state_transition
        #print(tf.expand_dims(subsequent_state, 0))
        q_vals = target_dqn(tf.expand_dims(subsequent_state, 0))
        q_values.append(q_vals.numpy())
        max_q_values = tf.reduce_max(q_vals, axis = 1)
        use_subsequent_state = tf.where(terminated, tf.zeros_like(max_q_values, dtype=tf.float32), tf.ones_like(max_q_values, dtype=tf.float32))
        q_target = reward + (gamma*max_q_values*use_subsequent_state)
        loss = training_step(q_target, tf.expand_dims(state, 0), tf.expand_dims(action,0))
        losses.append(loss)
        if i>=num_training_steps:
            break
    return np.mean(losses), np.mean(q_values)

In [7]:
def polyak_averaging_weights(source_network, target_network, polyak_averaging_factor):
    source_network_weights = source_network.get_weights()
    target_network_weights = target_network.get_weights()
    averaged_weights = []
    for source_network_weight, target_network_weight in zip(source_network_weights, target_network_weights):
        fraction_kept_weights = polyak_averaging_factor * target_network_weight
        fraction_updated_weights = (1-polyak_averaging_factor) * source_network_weight
        average_weight = fraction_kept_weights * fraction_updated_weights
        averaged_weights.append(average_weight)
    target_network.set_weights(averaged_weights)
    

In [8]:
def dqn():
    BUFFER_MAX_SIZE = 100000
    NUM_ENVS = 5
    K_STEPS_PER_ITER = 4
    NUM_TRAINING_ITERS = 4000
    TEST_EVERY_N_STEPS = 100
    POLYAK_AVERAGING_FACTOR = 0.02
    
    erp = Buffer(BUFFER_MAX_SIZE, NUM_ENVS, observation_preprocessing_function)
    dqn_agent = create_model()
    target_network = create_model()
    dqn_agent(tf.random.uniform(shape=(1,8))) #??

    
    dqn_optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
    GAMMA = 0.98


    return_tracker = []
    dqn_prediction_error = []
    average_q_values = []

    for step in range(NUM_TRAINING_ITERS):
        erp.fill_with_samples(dqn_agent)
        dataset = erp.create_dataset()
        #print("dataset", dataset)
        avg_loss, avg_q_vals = train_dqn(dqn_agent, target_network, dataset, dqn_optimizer, K_STEPS_PER_ITER, GAMMA)
        polyak_averaging_weights(dqn_agent, target_network, POLYAK_AVERAGING_FACTOR)

        if step % TEST_EVERY_N_STEPS == 0:
            #dqn_prediciton_error.append(avg_loss)
            #average_q_values.append(avg_q_vals)
            #print("avg_loss", avg_loss)
            #print("avg_q_vals", avg_q_vals)
            print(step)
            dqn_agent.save('some_other_model')
    return dqn_agent


In [9]:
Q_theta = dqn() #this just learns zeros

0
INFO:tensorflow:Assets written to: some_other_model/assets
100
INFO:tensorflow:Assets written to: some_other_model/assets
200
INFO:tensorflow:Assets written to: some_other_model/assets
300
INFO:tensorflow:Assets written to: some_other_model/assets
400
INFO:tensorflow:Assets written to: some_other_model/assets
500
INFO:tensorflow:Assets written to: some_other_model/assets
600
INFO:tensorflow:Assets written to: some_other_model/assets
700
INFO:tensorflow:Assets written to: some_other_model/assets
800
INFO:tensorflow:Assets written to: some_other_model/assets
900
INFO:tensorflow:Assets written to: some_other_model/assets
1000
INFO:tensorflow:Assets written to: some_other_model/assets
1100
INFO:tensorflow:Assets written to: some_other_model/assets
1200
INFO:tensorflow:Assets written to: some_other_model/assets
1300
INFO:tensorflow:Assets written to: some_other_model/assets
1400
INFO:tensorflow:Assets written to: some_other_model/assets


KeyboardInterrupt: 

In [4]:
Q_theta = tf.keras.models.load_model("not_working_model")



In [5]:
test_env = gym.make('LunarLander-v2', render_mode='human')
obs, inf = test_env.reset()

In [7]:
for i in range(1):
    qs = Q_theta(tf.expand_dims(obs, 0))
    print(qs)
    act = np.argmax(qs)
    obs, _, terminated, _, _ = test_env.step(act)
    if(terminated):
        obs, _ = test_env.reset()

tf.Tensor([[0. 0. 0. 0.]], shape=(1, 4), dtype=float32)


: 