In [3]:
#!pip install gymnasium[atari]
#!pip install gymnasium[accept-rom-license]

In [4]:
#!pip install "gym[accept-rom-license, atari]"

In [5]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.layers import Conv2D, Flatten, Dense
from stable_baselines3.common.vec_env.vec_frame_stack import VecFrameStack
from stable_baselines3.common.env_util import make_atari_env
from keras.models import Sequential
import random
import os
import gc 
import os, psutil 
from tqdm import tqdm
import pandas as pd
from collections import deque

# --- Hyper-parameters and bookkeeping used by the training cell below ---
gamma = 0.99                                    # Discount factor applied to the bootstrapped next-state value
num_actions = 5                                 # 4 directions + no-op (TODO confirm against env.action_space)

initial_sample_size = 1000                      # Number of environment steps collected before training starts
batch_size = 32                                 # How many experiences to use for each training step
max_replay_size = 10000                         # The maximum size of the replay buffer
target_update_period = 100                      # Episodes per round; the target network is synced after each round
max_episode_rew_history = target_update_period  # How many past episodes to average over when calculating score

experience_replay = []                          # Placeholder; rebound to an ExperienceReplay instance in the training cell
episode_rew_history = []                        # A buffer used to store past episode rewards
episode_count = 0                               # A counter for the number of episodes
episode_rew = 0                                 # The reward accumulated over the current episode
training_episodes = 10000                       # The number of episodes to train for
avg_episodes_rew = 0                            # Mean reward over the most recent max_episode_rew_history episodes

#checkpoint_dir = '/kaggle/working/checkpoints'  # Where to save checkpoints
checkpoint_dir = './checkpoints'                 # Where to save checkpoints
checkpoint_every_n_update = 2                    # Checkpoint period; compared against the round counter of the training loop

2024-01-12 18:21:41.862348: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-12 18:21:41.862381: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-12 18:21:41.863738: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-12 18:21:41.870660: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class LinearIterator:
    """Linear schedule going from ``start`` to ``end`` over ``steps`` steps.

    Used for the epsilon-greedy exploration rate: ``value(0)`` is ``start``,
    ``value(steps)`` is ``end`` and values in between are interpolated
    linearly.
    """

    def __init__(self, start, end, steps):
        """Store the schedule end points and its length.

        Args:
            start: Value returned at step 0 (and for any negative step).
            end: Value returned at ``steps`` (and for any later step).
            steps: Number of steps over which to interpolate.
        """
        self.start = start
        self.end = end
        self.steps = steps

    def value(self, step):
        """Return the scheduled value at ``step``.

        The result is clamped to the ``[start, end]`` segment, so querying a
        step outside ``[0, steps]`` never extrapolates (e.g. an exploration
        rate can not become negative). A schedule with ``steps <= 0`` has no
        interpolation range and returns ``end``.
        """
        if self.steps <= 0 or step >= self.steps:
            return self.end
        if step <= 0:
            return self.start
        return self.start + (self.end - self.start) * step / self.steps

In [6]:
def cpu_stats():
    """Report memory usage of the current process and of the machine.

    Returns:
        tuple: ``(process_gib, system_percent)`` where ``process_gib`` is the
        first field of the process's ``memory_info()`` divided by 2**30
        (presumably the resident set size in GiB) and ``system_percent`` is
        ``psutil.virtual_memory().percent``; both rounded to two decimals.
    """
    bytes_per_gib = 2. ** 30
    this_process = psutil.Process(os.getpid())

    process_gib = this_process.memory_info()[0] / bytes_per_gib
    system_percent = psutil.virtual_memory().percent

    return np.round(process_gib, 2), np.round(system_percent, 2)

In [7]:
def get_train_fn():
    """Create a ``tf.function``-wrapped single gradient step for a Q-network.

    Returns:
        A callable ``train_function(batch_state, action_mask, target, Q,
        loss_function, optimizer)`` that performs one optimisation step on
        ``Q`` and returns the loss.
    """
    @tf.function
    def train_function(batch_state, action_mask, target, Q, loss_function, optimizer):
        with tf.GradientTape() as tape:
            # Zero out every Q-value except the one of the action actually
            # taken, then collapse the action axis to get one value per sample.
            masked_q = tf.multiply(Q(batch_state), action_mask)
            chosen_q = tf.reduce_sum(masked_q, axis=1)
            loss = loss_function(target, chosen_q)

        trainable = Q.trainable_variables
        gradients = tape.gradient(loss, trainable)
        optimizer.apply_gradients(zip(gradients, trainable))
        return loss

    return train_function

In [8]:
def create_q_nn(num_actions):
    """Build the convolutional Q-network used on the stacked image frames.

    Args:
        num_actions: Size of the final linear layer (one output per action).

    Returns:
        A Keras ``Sequential`` model taking inputs of shape ``(84, 84, 4)``.
    """
    # Three convolutions followed by a dense head.
    feature_layers = [
        Conv2D(filters=32, kernel_size=8, strides=4, activation="relu", input_shape=(84, 84, 4)),
        Conv2D(filters=64, kernel_size=4, strides=2, activation="relu"),
        Conv2D(filters=64, kernel_size=3, strides=1, activation="relu"),
        Flatten(),
    ]
    head_layers = [
        Dense(units=512, activation="relu"),
        Dense(units=num_actions, activation="linear"),
    ]

    return Sequential(feature_layers + head_layers)

In [9]:
class CheckpointManager:
    """Saves and restores the optimizer, the model and an episode counter.

    On construction the checkpoint directory is created if needed and the
    most recent checkpoint found in it (if any) is restored.
    """

    def __init__(self, checkpoint_dir, optimizer, model):
        """Set up the checkpoint object and restore the latest checkpoint.

        Args:
            checkpoint_dir: Directory where checkpoints are read and written.
            optimizer: Optimizer whose state is tracked by the checkpoint.
            model: Model whose weights are tracked by the checkpoint.
        """
        self.checkpoint_dir = checkpoint_dir
        self.checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
        self.checkpoint = tf.train.Checkpoint(
            optimizer=optimizer,
            model=model,
            last_episode_saved=tf.Variable(0, dtype=tf.int64),
        )

        # Create the directory if it doesn't exist
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Restore the latest checkpoint if it exists (looked up only once so
        # the path restored and the path reported can not differ).
        latest = tf.train.latest_checkpoint(checkpoint_dir)
        if latest:
            self.checkpoint.restore(latest)
            print("Restored from {}".format(latest))
            print("Last episode saved: {}".format(int(self.checkpoint.last_episode_saved.numpy())))
        else:
            print("Initializing from scratch.")

    def save_checkpoint(self, episode_count):
        """Write a checkpoint tagged with ``episode_count``.

        Args:
            episode_count: Episode number stored in ``last_episode_saved``.
        """
        print("Saving checkpoint...")
        # Update the tracked variable in place instead of replacing it with a
        # brand new tf.Variable on every save.
        self.checkpoint.last_episode_saved.assign(episode_count)

        save_path = self.checkpoint.save(file_prefix=self.checkpoint_prefix)

        # NOTE(review): the session reset followed by a restore is kept from
        # the original code, presumably to limit memory growth — confirm it
        # is still needed.
        tf.keras.backend.clear_session()
        # Restore exactly what was just written, using this manager's own
        # directory rather than a module-level global.
        self.checkpoint.restore(save_path)

In [10]:
import json
class ExperienceReplay:
    """Replay buffer that also drives epsilon-greedy interaction with the env.

    Stores ``(obs, action, rew, next_obs, done)`` transitions in a bounded
    deque, tracks the reward of finished episodes, and keeps a list of
    average rewards recorded at checkpoint time.
    """

    def __init__(self, max_replay_size, max_episode_rew_history, eps_it, num_actions=5):
        """Create an empty buffer.

        Args:
            max_replay_size: Maximum number of transitions kept.
            max_episode_rew_history: Maximum number of episode rewards kept.
            eps_it: Object with a ``value(step)`` method giving the
                exploration rate for a given episode count.
            num_actions: Number of discrete actions random exploration picks
                from (defaults to 5, the value previously hard-coded).
        """
        self.max_replay_size = max_replay_size
        self.max_episode_rew_history = max_episode_rew_history
        self.eps_it = eps_it
        self.num_actions = num_actions
        self.experiences = deque(maxlen=max_replay_size)
        self.episode_rew_history = deque(maxlen=max_episode_rew_history)

        self.checkpoint_rew = []
        self.episode_rew = 0

    def add_experience(self, episode_count, obs, env, Q):
        """Take one epsilon-greedy step in ``env`` and store the transition.

        Args:
            episode_count: Step passed to ``eps_it.value`` to get epsilon.
            obs: Current observation (a single, un-batched observation).
            env: Vectorized environment wrapping exactly one environment.
            Q: Model used to pick the greedy action.

        Returns:
            tuple: ``(next_obs, rew, done, info)`` for the single environment.
        """
        eps = self.eps_it.value(episode_count)
        if eps >= random.random():
            action = random.randrange(self.num_actions)
        else:
            # We can also use numpy but this is more efficient
            tensor_state = tf.convert_to_tensor(obs)
            # Dimensions need to be expanded because the model expects a batch, not a single element
            expanded_state = tf.expand_dims(tensor_state, axis=0)
            actions = Q(expanded_state, training=False)[0]
            action = tf.argmax(actions).numpy()

        # Made to work with only one environment because I want to use StackedFrames
        # and it doesn't work with non vectorized environments
        next_obs, rew, done, info = env.step([action])
        rew = rew[0]
        done = done[0]
        info = info[0]
        next_obs = next_obs.squeeze()

        self.episode_rew += rew

        if done:
            self.episode_rew_history.append(self.episode_rew)
            self.episode_rew = 0

        self.experiences.append((obs, action, rew, next_obs, done))

        return next_obs, rew, done, info

    def _sample_indices(self, batch_size):
        """Pick ``batch_size`` distinct indices over the whole buffer."""
        # range(len(...)) covers every stored transition, including the most
        # recent one (the previous upper bound of len - 1 silently excluded it).
        return random.sample(range(len(self.experiences)), batch_size)

    def sample_batch(self, batch_size):
        """Sample ``batch_size`` distinct transitions uniformly at random.

        Returns:
            tuple: ``(batch_state, batch_action, batch_rew, batch_next_state,
            batch_done)``; states are numpy arrays, actions and rewards are
            lists and ``batch_done`` is a float tensor of 0/1 flags.

        Raises:
            ValueError: If the buffer holds fewer than ``batch_size`` items.
        """
        # Look each transition up once instead of once per field.
        transitions = [self.experiences[idx] for idx in self._sample_indices(batch_size)]

        batch_state = np.array([t[0] for t in transitions])
        batch_action = [t[1] for t in transitions]
        batch_rew = [t[2] for t in transitions]
        batch_next_state = np.array([t[3] for t in transitions])
        batch_done = tf.convert_to_tensor([float(t[4]) for t in transitions])

        return batch_state, batch_action, batch_rew, batch_next_state, batch_done

    def get_last_episode_rew(self):
        """Return the total reward of the most recently finished episode."""
        return self.episode_rew_history[-1]

    def get_episode_rew_history(self):
        """Return the deque of recent episode rewards."""
        return self.episode_rew_history

    def get_avg_episode_rew(self):
        """Return the mean of the stored episode rewards."""
        return np.mean(self.episode_rew_history)

    def update_checkpoint_rew(self):
        """Record the current average episode reward for this checkpoint."""
        # Stored as a plain float so the list stays JSON-serialisable even
        # when the rewards are numpy float32 scalars.
        self.checkpoint_rew.append(float(self.get_avg_episode_rew()))

    def get_checkpoint_rew(self):
        """Return the list of average rewards recorded at checkpoints."""
        return self.checkpoint_rew

    def save_checkpoint_rew(self, checkpoint_dir):
        """Write the recorded checkpoint rewards to ``checkpoint_rew.json``.

        Args:
            checkpoint_dir: Directory in which the JSON file is written.
        """
        # Save the checkpoint_rew list in a JSON file; numpy scalars such as
        # float32 are converted because json can not serialise them.
        with open(os.path.join(checkpoint_dir, 'checkpoint_rew.json'), 'w') as f:
            json.dump([float(rew) for rew in self.checkpoint_rew], f)


In [11]:
# make_atari_env creates an environment which reduces image sizes,
# clips rewards to -1, 0 or 1 and replaces RGB with grayscale.
# VecFrameStack stacks the 4 most recent frames on top of each other so the
# network can see how Pacman and the ghosts move.
env = VecFrameStack(make_atari_env("ALE/MsPacman-v5"), n_stack=4)
obs = env.reset().squeeze()

# target fixed network
Q_target = create_q_nn(num_actions)

# network we train
Q = create_q_nn(num_actions)

train_fn = get_train_fn()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
# Maybe change to MSE
loss_function = keras.losses.Huber()

eps_it = LinearIterator(1, 0.1, training_episodes)

checkpoint_manager = CheckpointManager(checkpoint_dir, optimizer, Q)

# Start the target network from the same weights as Q. This matters when Q
# has just been restored from a checkpoint: otherwise the first round would
# bootstrap from an unrelated, randomly initialised target network.
Q_target.set_weights(Q.get_weights())

experience_replay = ExperienceReplay(max_replay_size, max_episode_rew_history, eps_it)

if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
    device = '/GPU:0'
else:
    print("GPU is not available, using CPU instead")
    device = '/CPU:0'

with tf.device(device):
    tf.keras.backend.clear_session()

    # Initial Warm Up
    print("Initial warm up:", initial_sample_size, "steps")
    for _ in tqdm(range(initial_sample_size)):
        # The loop index counts steps, not episodes, so it must not be fed to
        # the epsilon schedule; episode 0 gives the initial exploration rate.
        next_obs, _, _, _ = experience_replay.add_experience(0, obs, env, Q)
        obs = next_obs

    # Training
    print("Starting training")
    for count in range(training_episodes // target_update_period):

        pbar = tqdm(range(target_update_period), desc="Episode")
        for episode_in_round in pbar:
            # Global episode number. The per-round index alone restarts at 0
            # every round, which kept epsilon from ever decaying.
            episode_count = count * target_update_period + episode_in_round

            # Done if episode is finished
            done = False

            # Repeat until episode is finished
            while not done:
                # New Experience
                next_obs, _, done, _ = experience_replay.add_experience(episode_count, obs, env, Q)

                # Sample a batch from the experience replay
                batch_state, batch_action, batch_rew, batch_next_state, batch_done = experience_replay.sample_batch(batch_size)

                # one hot encoded actions
                action_mask = tf.one_hot(batch_action, num_actions)
                # Call the model directly: predict() is meant for large
                # inputs and adds heavy per-call overhead inside a loop.
                target_val = Q_target(batch_next_state, training=False)

                target = batch_rew + gamma * tf.reduce_max(target_val, axis=1)

                # set last value to -1 if we have terminated. The goal is to avoid getting killed
                target = target * (1 - batch_done) - batch_done

                # Actual training
                train_fn(batch_state, action_mask, target, Q, loss_function, optimizer)

                # Update the current observation
                obs = next_obs

            obs = env.reset().squeeze()
            episode_rew = experience_replay.get_last_episode_rew()
            pbar.set_postfix({"Last episode reward ": episode_rew})

        gc.collect()

        avg_episodes_rew = experience_replay.get_avg_episode_rew()
        cpu_stat = cpu_stats()

        print("Running reward: {} | Memory in use: {} | Memory in use %: {}".format(avg_episodes_rew, cpu_stat[0], cpu_stat[1]))
        print()

        # Checkpoint every `checkpoint_every_n_update` rounds, skipping round 0.
        # (The previous `count > 0 % n == 0` parsed as `count > 0 and 0 == 0`,
        # which ignored the period entirely.)
        if count > 0 and count % checkpoint_every_n_update == 0:
            checkpoint_manager.save_checkpoint(episode_count)
            experience_replay.update_checkpoint_rew()

        Q_target.set_weights(Q.get_weights())

Q.save("./Q_model")
env.close()
print("Training finished!")

experience_replay.save_checkpoint_rew(checkpoint_dir)

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]
2024-01-12 18:21:52.919241: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Initializing from scratch.
GPU is not available, using CPU instead
Initial warm up: 1000 steps


100%|██████████| 1000/1000 [00:07<00:00, 135.28it/s]


Starting training


Episode: 100%|██████████| 10/10 [00:37<00:00,  3.77s/it, Last episode reward =2]


Running reward: 10.8 | Memory in use: 1.38 | Memory in use %: 52.8

Saving checkpoint...


Episode: 100%|██████████| 10/10 [00:53<00:00,  5.31s/it, Last episode reward =21]


Running reward: 14.6 | Memory in use: 1.47 | Memory in use %: 53.8



Episode: 100%|██████████| 10/10 [00:44<00:00,  4.44s/it, Last episode reward =13]


Running reward: 11.5 | Memory in use: 1.51 | Memory in use %: 56.6

Saving checkpoint...


Episode: 100%|██████████| 10/10 [00:46<00:00,  4.67s/it, Last episode reward =11]


Running reward: 13.6 | Memory in use: 1.54 | Memory in use %: 55.4



Episode: 100%|██████████| 10/10 [00:39<00:00,  3.93s/it, Last episode reward =10]


Running reward: 9.4 | Memory in use: 1.56 | Memory in use %: 55.8

Saving checkpoint...


Episode: 100%|██████████| 10/10 [00:44<00:00,  4.48s/it, Last episode reward =7]


Running reward: 11.7 | Memory in use: 1.56 | Memory in use %: 55.9



Episode:  60%|██████    | 6/10 [00:21<00:14,  3.60s/it, Last episode reward =7] 


KeyboardInterrupt: 