## Lab 1 Space Invaders
- Using Reinforcement Learning / Deep Q-Learning to train a model to play Space Invaders, originally an Atari 2600 game from 1980.  
- Gymnasium library is used. It emulates the environment and provided interfaces to control the game.  
- Reinforcement Learning and how it works with the chosen library:
Action space:
Observation space:
Steps / Actions:
Rewards:
- Using Google Colab for the learning process:

In [1]:
import keras
from keras import layers
import gymnasium as gym
import ale_py
from gymnasium.wrappers.atari_preprocessing import AtariPreprocessing
from gymnasium.wrappers.frame_stack import FrameStack
import numpy as np
import tensorflow as tf
from datetime import datetime

gym.register_envs(ale_py)

In [30]:
env = gym.make('SpaceInvadersNoFrameskip-v4', render_mode='rgb_array') # render_mode='rbg_array' för att få ut videos
env = AtariPreprocessing(env) #standardiserar till 84 x 84 pixlar och gråskala
env = FrameStack(env, num_stack=4) # staplar 4 frames för att ge agenten en uppfattning om rörelse

num_actions = env.action_space.n #antal möjliga actions

# för att spara video
nbr_episodes = 25 # antal episoder som ska gå innan man spelar in en video
trigger = lambda t : t % nbr_episodes == 0 # trigger för när man ska spela in video
env = gym.wrappers.RecordVideo(env, video_folder = './space_videos_3', episode_trigger=trigger, disable_logger=True)

In [22]:
#help(env)
print(env.action_space)        # Check the action space
print(env.unwrapped.get_action_meanings()) # Check what each action does
print(env.observation_space)   # Check the observation space
print(env.metadata)            # Check environment metadata


Discrete(6)
['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
Box(0, 255, (8, 84, 84), uint8)
{'render_modes': ['human', 'rgb_array'], 'render_fps': 30}


In [29]:
def create_q_model():
    return keras.Sequential(
        [
            layers.Input(shape=(84, 84, 4)),   # Input layer
            layers.Conv2D(32, kernel_size=8, strides=4, activation="relu"),
            layers.Conv2D(64, kernel_size=4, strides=2, activation="relu"),
            layers.Conv2D(64, kernel_size=3, strides=1, activation="relu"),
            layers.Flatten(),
            layers.Dense(512, activation="relu"),
            layers.Dense(num_actions, activation="linear")
        ]
    )


In [24]:
# Ladda en modell från disk
model = keras.models.load_model('keras_models_2\\spaceinvaders_model_4136.keras')
model_target = keras.models.load_model('keras_models_2\\spaceinvaders_model_4136.keras')

#skapa en ny modell
# model = create_q_model()
# model_target = create_q_model()

optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

In [25]:
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0

gamma = 0.99
epsilon = 0.1
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = (
    epsilon_max - epsilon_min
)
batch_size = 32

# maximum episodes to run
max_episodes = 10000 # 500
# max frames to run
max_frames = 1e7
# Number of frames to take random action and observe output
epsilon_random_frames = 0#50000 #1000
# Number of frames for exploration
epsilon_greedy_frames = 100000.0 #1000000.0
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100000 #1000000

# kolla förel
max_steps_per_episode = 10000
# Train the model after 4 actions
update_after_actions = 4
# How often to update the target network
update_target_network = 10000 #1000
# Using huber loss for stability
loss_function = keras.losses.Huber()

In [28]:
while True:
    observation, _ = env.reset()
    state = np.array(observation)
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # take random action
            action = env.action_space.sample()
        else:
            state_tensor = keras.ops.convert_to_tensor(state)
            state_tensor = keras.ops.expand_dims(state_tensor, 0)
            state_tensor = keras.ops.transpose(state_tensor, [0, 2, 3, 1])
            action_probs = model(state_tensor, training=False)
            # Take best action
            action = keras.ops.argmax(action_probs[0].numpy())

        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        state_next, reward, done, _ , _ = env.step(action)
        state_next = np.array(state_next)
        episode_reward += reward

        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next
    
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # get indices of smaples for replay buffers
            indices = np.random.choice(range(len(done_history)), size=batch_size)
            # Using list comprehension to sample from replay buffer
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            #print(f"State_next_sample.shape : {state_next_sample.shape}")
            reward_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = keras.ops.convert_to_tensor([float(done_history[i]) for i in indices])

            state_next_sample = keras.ops.convert_to_tensor(state_next_sample)
            #state_next_sample_tensor = keras.ops.expand_dims(state_next_sample, 0)
            state_next_sample = keras.ops.transpose(state_next_sample, [0, 2, 3, 1])
            future_rewards = model_target.predict(state_next_sample, verbose=0) # predikterar framtida rewards

            updated_q_values = reward_sample + gamma * keras.ops.amax(future_rewards, axis=1) # uppdaterar q-värden
            # if final frame set the last value to -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample
            # create a mask so we only calculate loss on the updated q values
            masks = keras.ops.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # training the model on the states and updated q-values
                state_sample = keras.ops.transpose(state_sample, [0, 2, 3, 1])
                q_values = model(state_sample)
                q_action = keras.ops.sum(keras.ops.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)
                # backpropagation:
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())
            # Log details
            print(f"Episode: {episode_count}, best score of last 100 episodes: {np.max(episode_reward_history)}, running_reward: {running_reward}, frame_count: {frame_count}")       
            model.save(f"keras_models_3/spaceinvaders_model_{episode_count}.keras")

        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 800:
        print(f"Solver at epiode{episode_count}")
        model.save(f"keras_models/breakout_model_{episode_count}.keras")
        break
    if (max_episodes > 0 and episode_count >= max_episodes):
        print(f"stopped at episode {episode_count}")
        model.save(f"keras_models/breakout_model_{episode_count}.keras")
        break
    if (max_frames > 0 and frame_count >= max_frames):
        print(f"stopped at frame {frame_count}")
        model.save(f"keras_models/breakout_model_{episode_count}.keras")
        break
        

ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "conv2d_18" is incompatible with the layer: expected axis -1 of input shape to have value 4, but received input with shape (1, 84, 84, 8)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1, 84, 84, 8), dtype=uint8)
  • training=False
  • mask=None