Prioritized Experience Replay

In [25]:
import numpy as np
import os
import matplotlib.pyplot as plt
import enviroment_no_visual as enviroment_no_visual
import enviroment_visual as enviroment_visual
import tensorflow as tf
import keras
import random

Funzione per creare una neural network lineare 

In [26]:
def Linear_QNet(units):
    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=[11]))
    model.add(keras.layers.Dense(units, activation='relu'))
    model.add(keras.layers.Dense(units=3))
    return model

Implementazione di un buffer circolare che permetta inserimento/cancellazione degli elementi e accesso random veloce. 

In [27]:
class PrioritizedReplayBuffer:
    def __init__(self, max_size, zeta=0.6):
        self.max_size = max_size
        self.zeta = zeta
        self.epsilon = 1e-6  # Piccolo valore per evitare divisioni per zero
        self.buffer = []
        self.priorities = []
        self.position = 0

    def add(self, error, experience):
        priority = self.get_priority(error)
        if len(self.buffer) < self.max_size:
            self.buffer.append(experience)
            self.priorities.append(priority)
        else:
            self.buffer[self.position] = experience
            self.priorities[self.position] = priority
        self.position = (self.position + 1) % self.max_size

    def get_priority(self, error):
        return (error + self.epsilon) ** self.zeta

    def slice_experiences(samples):
        states, actions, rewards, next_states, game_overs = [
            np.array([experience[field_index] for experience in samples])
                        for field_index in range(5)]
        return states, actions, rewards, next_states, game_overs

    def sample(self, batch_size, beta):
        priorities = np.array(self.priorities)
        probabilities = priorities / priorities.sum()
        
        indices = np.random.choice(len(self.buffer), batch_size, p=probabilities)
        samples = [self.buffer[idx] for idx in indices]
        
        sampling_probabilities = probabilities[indices]
        weights = np.power(len(self.buffer) * sampling_probabilities, -beta)
        weights /= weights.max()  # Normalizzazione dei pesi

        states, actions, rewards, next_states, game_overs = slice_experiences(samples)

        return states, actions, rewards, next_states, game_overs, indices, weights

    def update(self, indices, errors):
        for i, error in zip(indices, errors):
            self.priorities[i] = self._get_priority(error)

In [28]:
class QTrainer:
    def __init__(self, model, lr, gamma):
        self.online_model = model
        self.gamma = gamma
        self.optimizer = keras.optimizers.Adam(learning_rate=lr)
        
    @tf.function
    def train_step(self, states, actions, rewards, next_states, dones, weights):
        next_Q_values = self.model(next_states)
        max_next_Q_values = tf.reduce_max(next_Q_values, axis=1)
        # Equazione di Bellman: Q value = reward + discount factor * expected future reward
        target_Q_values = rewards + (1 - dones) * self.gamma * max_next_Q_values
        with tf.GradientTape() as tape:
            all_Q_values = self.model(states)  
            Q_values = tf.reduce_sum(all_Q_values * actions, axis=1, keepdims=True)
            td_errors = target_Q_values - Q_values
            loss = tf.reduce_mean(weights * tf.square(td_errors))
        # Backpropagation
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        errors = np.abs(td_errors.numpy())
        return errors

In [29]:
def convert_to_tensorflow(states, actions, rewards, next_states, dones):
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
    dones = tf.convert_to_tensor(dones, dtype=tf.float32)
    return states, actions, rewards, next_states, dones

In [30]:
class Agent:
    def __init__(self, lr, gamma, max_memory, batch_size, nn_model):
        self.n_games = 0
        self.epsilon = 1  
        self.lr = lr
        self.gamma = gamma  
        self.memory = PrioritizedReplayBuffer(max_size=max_memory)
        self.beta = 0.4
        self.batch_size = batch_size
        self.model = nn_model
        self.target_model = keras.models.clone_model(self.model)
        self.trainer = QTrainer(self.model, self.target_model, lr=self.lr, gamma=self.gamma)

    def remember(self, state, action, reward, next_state, done):
        self.memory.add(convert_to_tensorflow(state, action, reward, next_state, done))

    def train_memory(self, beta):
        states, actions, rewards, next_states, dones, indices, weights = self.memory.sample(self.batch_size, self.beta)
        errors = self.trainer.train_step(states, actions, rewards, next_states, dones, weights)
        self.replay_buffer.update(indices, errors)

    def epsilon_greedy_policy(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.randint(3)
        else:
            Q_values = self.model(state[np.newaxis])
            return np.argmax(Q_values[0])
        
    def get_action(self, state):
        final_move = [0, 0, 0]  
        move = self.epsilon_greedy_policy(state)
        final_move[move] = 1
        return final_move
    
    def train_agent(self, N_GAME, visual=True):
        if visual:
            env = enviroment_visual.SnakeGameAI(speed=0)
        else:
            env = enviroment_no_visual.SnakeGameAI()
        score_list = []
        record = 0
        n_eps_zero = int(N_GAME*0.7)
        step=0
        while self.n_games < N_GAME:
            state_old = env.get_state()
            final_move = self.get_action(state_old)
            state_new, reward, done, score = env.play_step(final_move)
            self.remember(state_old, final_move, reward, state_new, done)

            if done:
                env.reset()
                self.n_games += 1
                print(f"\rGame: {self.n_games}, Epsilon: {self.epsilon:3f}, Score: {score}, Record: {record}, Step eseguiti: {step}.", end="")
                self.epsilon = max(((n_eps_zero - self.n_games) / n_eps_zero), 0)
                if len(self.memory.max_size) >= self.batch_size:
                    self.train_memory(beta)
                    beta = min(1.0, beta + (N_GAME - self.n_games)/N_GAME*0.6)
                if score > record:
                    record = score
                    self.save_model()
                score_list.append(score)
            step+=1
        return score_list

    def save_model(self, model_dir_path="./DQNmodel", file_name='model.keras'):
        if not os.path.exists(model_dir_path):
            print(f"La cartella non esiste. Sarà creata con nome: {model_dir_path}")
            os.mkdir(model_dir_path)
        file_name = os.path.join(model_dir_path, file_name)
        self.model.save(file_name)

Mostra andamento dello score per partita durante il training

In [31]:
def plot_trand(scores, save_path=None):
    zeros = [0 for i in range(49)]
    calcola_media = lambda i: sum(scores[i-50:i+1]) / 50
    media_precedenti = zeros + list(map(calcola_media, range(49, len(scores))))
    max_mean_value = max(media_precedenti)
    max_mean_index = media_precedenti.index(max_mean_value)
    plt.plot(scores, label='Score')
    plt.plot(media_precedenti, label='Mean score delle ultime 50 partite')
    plt.text(max_mean_index, max_mean_value, f'{max_mean_value:.2f}', fontsize=12, color="darkorange", ha='center')

    plt.title("Andamento del training.")
    plt.xlabel("Partite")
    plt.ylabel("Score")
    plt.legend()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

Crea e allena un agente

In [32]:
online_model = Linear_QNet(128)
agent = Agent(lr=0.05, gamma=0.9, max_memory=100_000, batch_size=1024, nn_model=online_model)
plot_scores = agent.train_agent(N_GAME=500, visual=False)
plot_trand(plot_scores)

TypeError: PrioritizedReplayBuffer.add() missing 5 required positional arguments: 'state', 'action', 'reward', 'next_state', and 'done'