In [1]:
!pip install gymnasium[atari]

Collecting gymnasium[atari]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[atari])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting shimmy[atari]<1.0,>=0.1.0 (from gymnasium[atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl (25 kB)
Collecting ale-py~=0.8.1 (from shimmy[atari]<1.0,>=0.1.0->gymnasium[atari])
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: farama-notifications, gymnasium, ale-py, shimmy
Successfully installed ale-py-0.8.1 farama-notifications-0.0.4 gymnasium-0.29.1 shimmy-0.2.1


In [2]:
!pip install gymnasium[accept-rom-license]

Collecting autorom[accept-rom-license]~=0.4.2 (from gymnasium[accept-rom-license])
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.4.2->gymnasium[accept-rom-license])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: AutoROM.accept-rom-license
  Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... [?25l[?25hdone
  Created wheel for AutoROM.accept-rom-license: filename=AutoROM.accept_rom_license-0.6.1-py3-none-any.whl size=446659 sha256=fef7dae5d4cdc5eec4d6aefba113a241e057e4bbc81b44ae6d8bca39a41e5d33
  Stored in directory: /root/.cache/pip/wheels/

In [3]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow import reduce_mean


class DQN:
    """
    Deep Q-Network agent with an experience replay memory and a target network.

    The two models only need to expose ``predict``, ``train_on_batch``,
    ``get_weights``, ``set_weights``, ``load_weights`` and ``save_weights``.
    """

    def __init__(self, state_space_shape, num_actions, model, target_model, learning_rate=0.1,
                 discount_factor=0.95, batch_size=16, memory_size=100):
        """
        Initializes Deep Q Network agent.
        :param state_space_shape: shape of the observation space (tuple, or int for flat states)
        :param num_actions: number of actions
        :param model: Keras model that is trained and used for action selection
        :param target_model: Keras model used to evaluate next states when building targets
        :param learning_rate: learning rate (stored only; the agent never applies it itself)
        :param discount_factor: discount factor
        :param batch_size: maximum number of experiences sampled per training step
        :param memory_size: maximum size of the experience replay memory
        """
        self.state_space_shape = state_space_shape
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.batch_size = batch_size
        # Bounded deque: once full, the oldest experience is dropped on append.
        self.memory = deque(maxlen=memory_size)
        self.model = model
        self.target_model = target_model
        self.update_target_model()

    def _batch_shape(self, batch_size):
        """
        Returns the array shape of ``batch_size`` stacked states.
        :param batch_size: number of states in the batch
        :return: tuple (batch_size, *state dimensions)
        """
        if isinstance(self.state_space_shape, tuple):
            return (batch_size,) + self.state_space_shape
        return (batch_size, self.state_space_shape)

    def update_memory(self, state, action, reward, next_state, done):
        """
        Adds experience tuple to experience replay memory.
        :param state: current state
        :param action: performed action
        :param reward: reward received for performing action
        :param next_state: next state
        :param done: if episode has terminated after performing the action in the current state
        """
        self.memory.append((state, action, reward, next_state, done))

    def update_target_model(self):
        """
        Synchronize the target model with the main model.
        """
        print('Updating target...')
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state, epsilon):
        """
        Returns an action for the current state following an epsilon greedy policy.
        :param state: current state
        :param epsilon: exploration rate, i.e. the probability of picking a uniformly random action
        :return: index of the selected action
        """
        # Explore with probability exactly epsilon. The draw is in [0, 1), so
        # epsilon=1.0 always explores and epsilon=0.0 is always greedy.
        if np.random.random() < epsilon:
            return np.random.randint(0, self.num_actions)
        state = np.asarray(state).reshape(self._batch_shape(1))
        return np.argmax(self.model.predict(state, verbose=0)[0])

    def load(self, model_name, episode):
        """
        Loads the weights of the model at specified episode checkpoint.
        :param model_name: name of the model
        :param episode: episode checkpoint
        """
        self.model.load_weights(f'dqn_{model_name}_{episode}.h5')

    def save(self, model_name, episode):
        """
        Stores the weights of the model at specified episode checkpoint.
        :param model_name: name of the model
        :param episode: episode checkpoint
        """
        self.model.save_weights(f'dqn_{model_name}_{episode}.h5')

    def train(self):
        """
        Performs one step of model training on a random minibatch from memory.
        Does nothing when the replay memory is empty.
        """
        batch_size = min(self.batch_size, len(self.memory))
        if batch_size == 0:
            # Nothing to learn from yet; avoid handing the model an empty batch.
            return
        minibatch = random.sample(self.memory, batch_size)

        sample_shape = self._batch_shape(1)[1:]
        states = np.zeros(self._batch_shape(batch_size))
        next_states = np.zeros(self._batch_shape(batch_size))
        actions = np.zeros(batch_size, dtype=np.intp)
        rewards = np.zeros(batch_size)
        terminal = np.zeros(batch_size, dtype=bool)
        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            states[i] = np.asarray(state).reshape(sample_shape)
            next_states[i] = np.asarray(next_state).reshape(sample_shape)
            actions[i] = action
            rewards[i] = reward
            terminal[i] = done

        # One forward pass per network for the whole minibatch instead of two
        # predict() calls per sampled experience.
        q_values = np.array(self.model.predict(states, verbose=0), dtype=np.float64)
        next_q = np.asarray(self.target_model.predict(next_states, verbose=0))

        # Terminal transitions get the bare reward; all others bootstrap from
        # the target network's best next-state value.
        bootstrap = np.where(terminal, 0.0, self.discount_factor * next_q.max(axis=1))
        # Only the taken action's Q-value is replaced, so the remaining outputs
        # contribute zero error to the loss.
        q_values[np.arange(batch_size), actions] = rewards + bootstrap

        print('Training step...')
        self.model.train_on_batch(states, q_values)

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError


def build_model(state_space_shape, num_actions, learning_rate):
    """
    Builds and compiles the convolutional Q-network.

    Architecture: two 2x2 ReLU convolutions (32 then 16 filters), a flatten,
    a 16-unit ReLU dense layer and a linear output with one unit per action.
    :param state_space_shape: input shape of a single state, passed to the first layer
    :param num_actions: number of output units (one Q-value per action)
    :param learning_rate: learning rate of the Adam optimizer
    :return: compiled Keras model using mean squared error loss
    """
    model = Sequential([
        Conv2D(32, (2, 2), activation='relu', input_shape=state_space_shape),
        Conv2D(16, (2, 2), activation='relu'),
        Flatten(),
        Dense(16, activation='relu'),
        Dense(num_actions, activation='linear'),
    ])
    # `lr` is a deprecated alias that current Keras optimizers reject;
    # `learning_rate` is the supported keyword.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=MeanSquaredError())
    return model

In [5]:
import numpy as np


def preprocess_state(state):
    """
    Converts a state to a fresh float64 array with every value divided by 255.
    :param state: array-like state (the input itself is left unmodified)
    :return: new float64 numpy array holding state / 255
    """
    scaled = np.array(state, dtype=np.float64)
    # Divide in place on the copy so no second temporary array is allocated.
    np.divide(scaled, 255, out=scaled)
    return scaled

In [6]:
import numpy as np

def preprocess_reward(reward):
    """
    Limits a reward to the closed interval [-1000, 1000].
    :param reward: scalar or array-like reward
    :return: the reward with values outside the interval replaced by the nearest bound
    """
    lower_bound, upper_bound = -1000., 1000.
    return np.clip(reward, a_min=lower_bound, a_max=upper_bound)

In [7]:
import gymnasium as gym

In [8]:
# Create the environment registered under the id 'ALE/MsPacman-v5'.
# No render_mode is passed here.
env = gym.make('ALE/MsPacman-v5')

  and should_run_async(code)


In [9]:
# Keep every observation axis except the last and set the final axis to 3
# (presumably the colour channels; TODO confirm against env.observation_space).
state_space_shape = env.observation_space.shape[:-1] + (3,)
# Size of the discrete action space; used as the network's output width.
num_actions = env.action_space.n

In [10]:
# Training hyperparameters read by the cells below.
num_episodes = 10       # number of training episodes
learning_rate = 0.01    # passed to build_model (optimizer) and to the DQN agent
discount_factor = 1.0   # passed to the DQN agent; 1.0 means future rewards are not discounted
epsilon = 1.0           # initial exploration rate handed to agent.get_action
epsilon_decay = 0.995   # epsilon is multiplied by this after each training episode
min_epsilon = 0.1       # floor for epsilon during training; also the rate used for evaluation
batch_size = 8          # maximum experiences sampled per agent.train() call
memory_size = 1000      # capacity of the agent's replay memory

In [11]:
# Two networks built with identical arguments: `model` is the one handed to the
# agent as its main model, `target_model` as its target model.
model = build_model(state_space_shape, num_actions, learning_rate)
target_model = build_model(state_space_shape, num_actions, learning_rate)



In [12]:
# Constructing the agent immediately copies model's weights into target_model
# (DQN.__init__ calls update_target_model, which prints 'Updating target...').
agent = DQN(state_space_shape, num_actions, model, target_model, learning_rate,
            discount_factor, batch_size, memory_size)

Updating target...


In [13]:
# Training: play num_episodes episodes with an epsilon-greedy policy, storing
# every transition in the agent's replay memory.
for episode in range(num_episodes):
    state, _ = env.reset()
    state = preprocess_state(state)
    done = False
    rewards = 0
    processed_rewards = 0
    steps = 0
    while not done:
        action = agent.get_action(state, epsilon)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The episode is over when either flag is set; ignoring `truncated`
        # would keep stepping an environment that has already been cut off.
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        new_state = preprocess_state(new_state)
        processed_reward = preprocess_reward(reward)
        # Store `terminated`, not `done`: a time-limit truncation is not a true
        # terminal state, so its target should still bootstrap from next_state.
        agent.update_memory(state, action, processed_reward, new_state, terminated)
        state = new_state
        rewards += reward
        processed_rewards += processed_reward
        steps += 1
    # NOTE(review): train() runs once per episode, so only num_episodes gradient
    # steps are taken in total; confirm this is intended.
    agent.train()
    print(f'Episode: {episode}, Original reward: {rewards}, Processed reward: {processed_rewards}, Steps: {steps}, Epsilon: {epsilon}')
    # Decay exploration but never drop below the configured floor.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
    # Sync the target network every 5th episode (including episode 0).
    if episode % 5 == 0:
        agent.update_target_model()

Training step...
Episode: 0, Original reward: 380.0, Processed reward: 380.0, Steps: 701, Epsilon: 1.0
Updating target...
Training step...
Episode: 1, Original reward: 300.0, Processed reward: 300.0, Steps: 395, Epsilon: 0.995
Training step...
Episode: 2, Original reward: 220.0, Processed reward: 220.0, Steps: 403, Epsilon: 0.990025
Training step...
Episode: 3, Original reward: 150.0, Processed reward: 150.0, Steps: 451, Epsilon: 0.985074875
Training step...
Episode: 4, Original reward: 660.0, Processed reward: 660.0, Steps: 825, Epsilon: 0.9801495006250001
Training step...
Episode: 5, Original reward: 170.0, Processed reward: 170.0, Steps: 401, Epsilon: 0.9752487531218751
Updating target...
Training step...
Episode: 6, Original reward: 320.0, Processed reward: 320.0, Steps: 559, Epsilon: 0.9703725093562657
Training step...
Episode: 7, Original reward: 200.0, Processed reward: 200.0, Steps: 469, Epsilon: 0.9655206468094844
Training step...
Episode: 8, Original reward: 200.0, Processed 

In [14]:
# Evaluation: play a single episode with the trained agent at the minimum
# exploration rate and print the (clipped) reward of every step.
done = False
state, _ = env.reset()
state = preprocess_state(state)
step = 0
while not done:
    action = agent.get_action(state, min_epsilon)
    # step() returns (obs, reward, terminated, truncated, info); stop on either
    # flag so a truncated episode does not keep being stepped.
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    state = preprocess_state(state)
    reward = preprocess_reward(reward)
    print(f'Step: {step}, Reward: {reward}')
    step += 1

Step: 0, Reward: 0.0
Step: 1, Reward: 0.0
Step: 2, Reward: 0.0
Step: 3, Reward: 0.0
Step: 4, Reward: 0.0
Step: 5, Reward: 0.0
Step: 6, Reward: 0.0
Step: 7, Reward: 0.0
Step: 8, Reward: 0.0
Step: 9, Reward: 0.0
Step: 10, Reward: 0.0
Step: 11, Reward: 0.0
Step: 12, Reward: 0.0
Step: 13, Reward: 0.0
Step: 14, Reward: 0.0
Step: 15, Reward: 0.0
Step: 16, Reward: 0.0
Step: 17, Reward: 0.0
Step: 18, Reward: 0.0
Step: 19, Reward: 0.0
Step: 20, Reward: 0.0
Step: 21, Reward: 0.0
Step: 22, Reward: 0.0
Step: 23, Reward: 0.0
Step: 24, Reward: 0.0
Step: 25, Reward: 0.0
Step: 26, Reward: 0.0
Step: 27, Reward: 0.0
Step: 28, Reward: 0.0
Step: 29, Reward: 0.0
Step: 30, Reward: 0.0
Step: 31, Reward: 0.0
Step: 32, Reward: 0.0
Step: 33, Reward: 0.0
Step: 34, Reward: 0.0
Step: 35, Reward: 0.0
Step: 36, Reward: 0.0
Step: 37, Reward: 0.0
Step: 38, Reward: 0.0
Step: 39, Reward: 0.0
Step: 40, Reward: 0.0
Step: 41, Reward: 0.0
Step: 42, Reward: 0.0
Step: 43, Reward: 0.0
Step: 44, Reward: 0.0
Step: 45, Reward: 0.