In [308]:
import numpy as np
import os
import matplotlib.pyplot as plt
import enviroment_no_visual as enviroment_no_visual
import enviroment_visual as enviroment_visual
import tensorflow as tf
import keras

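Implementazione di un replay buffer circolare con priorità (Prioritized Experience Replay) che permette l'inserimento degli elementi, la sostituzione automatica dei più vecchi quando il buffer è pieno e un accesso casuale veloce.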

In [309]:
class PrioritizedReplayBuffer:
    """Fixed-capacity ring buffer whose entries are sampled by priority.

    Every stored experience has an associated priority. `sample` draws
    indices with probability proportional to ``priority ** zeta`` and also
    returns importance-sampling weights normalised by their maximum.
    """

    def __init__(self, max_size, zeta=0.6):
        """Create an empty buffer holding at most `max_size` experiences."""
        self.max_size = max_size
        self.zeta = zeta
        self.buffer = []
        # Slot that the next append() will write to.
        self.index = 0
        self.priorities = np.zeros((max_size,), dtype=np.float32)

    def append(self, experience):
        """Store `experience`, overwriting the oldest slot once the buffer is full.

        The new entry receives the largest priority currently stored
        (1.0 when the buffer is still empty).
        """
        if self.buffer:
            top_priority = self.priorities.max()
        else:
            top_priority = 1.0

        slot = self.index
        if len(self.buffer) >= self.max_size:
            self.buffer[slot] = experience
        else:
            self.buffer.append(experience)

        self.priorities[slot] = top_priority
        self.index = (slot + 1) % self.max_size

    def sample(self, batch_size, beta=0.4):
        """Draw `batch_size` experiences (with replacement) according to priority.

        Returns:
            (states, actions, rewards, next_states, dones, indices, weights),
            where the first five are the stacked fields of the sampled
            experiences, `indices` are their positions in the buffer and
            `weights` are ``(N * p) ** -beta`` divided by their maximum.
        """
        filled = len(self.buffer)
        # While the buffer is still filling up, only the first `index`
        # priorities belong to stored experiences.
        if filled == self.max_size:
            active_priorities = self.priorities
        else:
            active_priorities = self.priorities[:self.index]

        probabilities = active_priorities ** self.zeta
        probabilities /= probabilities.sum()

        indices = np.random.choice(filled, batch_size, p=probabilities)

        weights = (filled * probabilities[indices]) ** (-beta)
        weights /= weights.max()

        batch = [self.buffer[position] for position in indices]
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, batch_indices, batch_errors):
        """Set the priority of each listed slot to ``|error| + 1e-5``."""
        for position, td_error in zip(batch_indices, batch_errors):
            # The small constant keeps every priority strictly positive.
            self.priorities[position] = np.abs(td_error) + 1e-5

Funzione per creare la rete neurale convoluzionale (CNN) che stima i Q-value delle 3 azioni possibili

In [310]:
def CNN_DQN(input_shape=(20, 20, 1), n_actions=3):
    """Build the convolutional Q-network.

    Architecture: three 3x3 'same'-padded ReLU convolutions (32, 64 and 128
    filters) with a 2x2 max-pool after the second one, followed by a
    128-unit ReLU dense layer and a linear output layer with one unit per
    action.

    Args:
        input_shape: shape of a single input sample, without the batch axis.
            Defaults to ``(20, 20, 1)``.
        n_actions: number of output units (one Q-value per action).
            Defaults to 3.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'),
        tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        # Linear activation: the outputs are unbounded Q-value estimates.
        tf.keras.layers.Dense(n_actions, activation='linear'),
    ])
    return model

In [311]:
# Instantiate the network once and print its layer-by-layer summary.
model = CNN_DQN()
model.summary()

In [312]:
class DQNetwork:
    """Double-DQN learner: an online Q-network plus a target copy of it.

    The online model is the one being trained; the target model is a clone
    whose weights are only refreshed when `update_weights` is called.
    """

    def __init__(self, lr, gamma):
        """Create both networks with identical weights.

        Args:
            lr: learning rate for the Adam optimizer.
            gamma: discount factor used in the Bellman target.
        """
        self.online_model = CNN_DQN()
        self.target_model = keras.models.clone_model(self.online_model)
        self.target_model.set_weights(self.online_model.get_weights())
        self.gamma = gamma
        self.optimizer = keras.optimizers.Adam(learning_rate=lr)
        self.loss_fn = keras.losses.mean_squared_error

    @tf.function
    def train_step(self, states, actions, rewards, next_states, dones, weights):
        """Run one gradient step on a batch and return the absolute TD errors.

        Args:
            states, next_states: batched network inputs.
            actions: one-hot encoded actions, one row per sample.
            rewards, dones: per-sample reward and terminal flag (1 = terminal).
            weights: per-sample importance-sampling weights.

        Returns:
            Tensor with ``|target - Q(s, a)|`` for every sample in the batch.
        """
        weights = tf.cast(weights, tf.float32)
        next_Q_values = self.online_model(next_states)
        # Double DQN: the online model picks the next action, while its
        # Q-value is estimated by the target model.
        best_next_actions = tf.argmax(next_Q_values, axis=1)
        # Take the number of actions from the network output instead of
        # hard-coding it.
        n_actions = tf.shape(next_Q_values)[1]
        mask_for_target = tf.one_hot(best_next_actions, n_actions)
        max_next_Q_values = tf.reduce_sum(self.target_model(next_states) * mask_for_target, axis=1)
        # Bellman equation: Q value = reward + discount factor * expected future reward
        target_Q_values = rewards + (1 - dones) * self.gamma * max_next_Q_values
        with tf.GradientTape() as tape:
            all_Q_values = self.online_model(states)
            Q_values = tf.reduce_sum(all_Q_values * actions, axis=1, keepdims=False)
            # The loss function averages over the last axis. On 1-D inputs
            # that collapses the whole batch into one scalar, so the
            # importance-sampling weights would only rescale the mean loss.
            # Adding a trailing axis keeps one loss value per sample.
            per_sample_loss = self.loss_fn(
                tf.expand_dims(target_Q_values, axis=-1),
                tf.expand_dims(Q_values, axis=-1),
            )
            loss = tf.reduce_mean(weights * per_sample_loss)
        # Backpropagation
        grads = tape.gradient(loss, self.online_model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.online_model.trainable_variables))
        td_errors = tf.abs(tf.subtract(target_Q_values, Q_values))
        return td_errors

    def update_weights(self):
        """Copy the online model's weights into the target model."""
        self.target_model.set_weights(self.online_model.get_weights())

    def save_model(self, model_dir_path="./DQNmodel/CNN", file_name='model.keras'):
        """Save the online model to ``model_dir_path/file_name``.

        The directory (including missing parents) is created when absent.
        """
        if not os.path.exists(model_dir_path):
            print(f"La cartella non esiste. Sarà creata con nome: {model_dir_path}")
            # makedirs also creates missing parent directories, which the
            # nested default path needs on a fresh checkout.
            os.makedirs(model_dir_path, exist_ok=True)
        file_name = os.path.join(model_dir_path, file_name)
        # The class has no `model` attribute; the trained network is
        # `online_model`.
        self.online_model.save(file_name)

In [313]:
def convert_to_tensorflow(states, actions, rewards, next_states, dones):
    """Return the five transition fields as float32 tensors, in argument order."""
    fields = (states, actions, rewards, next_states, dones)
    return tuple(tf.convert_to_tensor(field, dtype=tf.float32) for field in fields)

In [314]:
class Agent:
    """Epsilon-greedy DQN agent that plays the Snake environment and learns from replay."""

    def __init__(self, lr, gamma, max_memory, batch_size, visual=True):
        """Set up the replay memory, the networks and the environment.

        Args:
            lr: learning rate forwarded to `DQNetwork`.
            gamma: discount factor forwarded to `DQNetwork`.
            max_memory: capacity of the prioritized replay buffer.
            batch_size: number of experiences sampled per training step.
            visual: if True use ``enviroment_visual.SnakeGameAI(speed=0)``,
                otherwise ``enviroment_no_visual.SnakeGameAI()``.
        """
        self.n_games = 0
        self.epsilon = 1
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(max_size=max_memory, zeta=0.6)
        self.dqnetwork = DQNetwork(lr=lr, gamma=gamma)
        self.env = enviroment_visual.SnakeGameAI(speed=0) if visual else enviroment_no_visual.SnakeGameAI()

    def remember(self, state, action, reward, next_state, done):
        """Convert one transition to tensors and push it into the replay memory."""
        self.memory.append(convert_to_tensorflow(state, action, reward, next_state, done))

    def train_memory(self, beta=0.4):
        """Train on one sampled batch and refresh the priorities of its samples.

        Does nothing until the memory holds at least `batch_size` experiences.
        """
        if len(self.memory.buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones, indices, weights = self.memory.sample(self.batch_size, beta)
        td_errors = self.dqnetwork.train_step(states, actions, rewards, next_states, dones, weights)
        self.memory.update_priorities(indices, td_errors.numpy())

    def epsilon_greedy_policy(self, state):
        """Return a random action index with probability epsilon, else the greedy one."""
        if np.random.rand() < self.epsilon:
            return np.random.randint(3)
        else:
            # Add a leading batch axis before querying the network.
            Q_values = self.dqnetwork.online_model(state[np.newaxis])
            return np.argmax(Q_values[0])

    def get_action(self, state):
        """Return the chosen action as a one-hot list of length 3."""
        final_move = [0, 0, 0]
        move = self.epsilon_greedy_policy(state)
        final_move[move] = 1
        return final_move

    def train_agent(self, N_GAME, epsilon_decay=0.995, epsilon_min=0.01, target_update_every=10):
        """Play until `N_GAME` games have finished, training once per finished game.

        Args:
            N_GAME: number of games to play.
            epsilon_decay: factor applied to epsilon after every game.
            epsilon_min: lower bound for epsilon.
            target_update_every: the target network is synced with the
                online network every this many games.

        Returns:
            List with the score of every finished game, in order.
        """
        score_list = []
        record = 0
        step = 0
        while self.n_games < N_GAME:
            state_old = self.env.get_matrix_state()
            final_move = self.get_action(state_old)
            state_new, reward, done, score = self.env.play_step_m(final_move)
            self.remember(state_old, final_move, reward, state_new, done)
            if done:
                self.env.reset()
                self.n_games += 1
                self.train_memory()
                # NOTE(review): `:3f` is a minimum width of 3, not 3 decimals;
                # `.3f` was probably intended. Left unchanged here.
                print(f"\rGame: {self.n_games}, Epsilon: {self.epsilon:3f}, Score: {score}, Record: {record}, Step eseguiti: {step}. ", end="")
                self.epsilon = max(epsilon_min, self.epsilon * epsilon_decay)
                if score > record:
                    record = score
                    self.dqnetwork.save_model()
                # Sync only on multiples of the period. The previous bare
                # `n_games % 10` was truthy on every game EXCEPT those.
                if self.n_games % target_update_every == 0:
                    self.dqnetwork.update_weights()
                score_list.append(score)
            step += 1
        return score_list

Mostra andamento dello score per partita durante il training

In [315]:
def plot_trend(scores, save_path=None, window=50):
    """Plot the per-game scores together with their trailing moving average.

    Args:
        scores: sequence with one score per game.
        save_path: if given, the figure is also saved to this path.
        window: number of games averaged by the moving mean (default 50).

    Raises:
        ValueError: if `window` is not positive.

    When fewer than `window` scores are available (for example after an
    interrupted training run) only the raw scores are drawn.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")

    n_scores = len(scores)
    plt.plot(scores, label='Score')

    # The moving average is defined only once a full window of games exists;
    # without this guard np.max would raise on an empty array.
    if n_scores >= window:
        media_precedenti = np.array(
            [sum(scores[i - window:i]) / window for i in range(window, n_scores + 1)]
        )
        max_mean_value = np.max(media_precedenti)
        # Shift by `window` to turn the array position into a game number.
        max_mean_index = int(np.argmax(media_precedenti)) + window
        plt.plot(range(window, n_scores + 1), media_precedenti,
                 label=f'Mean score delle ultime {window} partite')
        plt.text(max_mean_index, max_mean_value, f'{max_mean_value:.2f}',
                 fontsize=12, color="darkorange", ha='center')

    plt.title("Andamento del training")
    plt.xlabel("Partite")
    plt.ylabel("Score")
    plt.legend()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

Crea e allena un agente

In [307]:
# Create an agent on the visual environment (visual=True), train it for
# 500 games and plot the per-game scores returned by train_agent.
agent = Agent(lr=0.001, gamma=0.99, max_memory=10_000, batch_size=8, visual=True)
training_result = agent.train_agent(N_GAME=500)
plot_trend(training_result)

(8,)
(8,)
(8,)

ciao

(8,)
(8,)
(8,)

ciao

(8,)
[0.01079709 0.01031572 0.01306524 0.00293279 0.00135436 0.00423329
 0.01018905 0.01974536]
Game: 1, Epsilon: 1.000000, Score: 0, Record: 0, Step eseguiti: 23. (8,)
[0.00091828 0.00461787 0.03691362 0.00091828 0.02302577 0.00031555
 0.0013182  0.00031555]
Game: 2, Epsilon: 0.995000, Score: 0, Record: 0, Step eseguiti: 55. (8,)
[0.00042663 0.0477029  0.00167055 0.01848498 0.00449861 0.00015063
 0.0014189  0.00817489]
Game: 3, Epsilon: 0.990025, Score: 0, Record: 0, Step eseguiti: 120. (8,)
[1.0038111e+01 1.0593869e-03 2.1786291e-02 2.1138771e-02 8.2529411e-03
 6.6241249e-05 1.2627009e-02 6.9647077e-03]
Game: 4, Epsilon: 0.985075, Score: 0, Record: 0, Step eseguiti: 160. (8,)
[0.01622419 0.00312757 0.00103642 0.01785484 0.0115378  0.00249844
 0.00067112 0.01597959]
Game: 5, Epsilon: 0.980150, Score: 0, Record: 0, Step eseguiti: 212. (8,)
[0.00832355 0.00063173 0.00124415 0.0002389  0.02332365 0.00025185
 0.00873105 0.02541934]
Game: 6, Epsi

KeyboardInterrupt: 

: 

Mostra visualmente una partita dell'Agente