In [24]:
import numpy as np
import os
import matplotlib.pyplot as plt
import enviroment_no_visual as enviroment_no_visual
import enviroment_visual as enviroment_visual
import tensorflow as tf
import keras

Implementazione di un buffer circolare a capacità fissa che permetta inserimento, cancellazione implicita degli elementi più vecchi (sovrascritti quando il buffer è pieno) e accesso random veloce.

In [25]:
class ReplayBuffer:
    """Fixed-capacity circular store of experiences with random sampling.

    Items are written into a pre-allocated object array. After ``max_size``
    appends the write cursor wraps around, so each new item replaces the
    oldest one still stored.
    """

    def __init__(self, max_size):
        """Allocate ``max_size`` empty slots and reset the cursor and counter."""
        self.max_size = max_size
        self.buffer = np.empty(max_size, dtype=object)
        self.index = 0  # slot the next append() writes to
        self.size = 0   # number of filled slots, never above max_size

    def append(self, obj):
        """Store ``obj`` in the current slot and advance the write cursor."""
        write_pos = self.index
        self.buffer[write_pos] = obj
        if self.size < self.max_size:
            self.size += 1
        # Wrap back to slot 0 once the end of the array is reached.
        self.index = (write_pos + 1) % self.max_size

    def sample(self, batch_size):
        """Return ``batch_size`` stored items picked at random, with replacement."""
        picks = np.random.randint(self.size, size=batch_size)
        return self.buffer[picks]

    def sample_experiences(self, batch_size):
        """Sample a batch and regroup it field by field.

        Each stored experience is indexed at positions 0..4; position ``k`` of
        every sampled experience is stacked into the ``k``-th returned array.

        Returns:
            Tuple ``(states, actions, rewards, next_states, game_over)`` of
            numpy arrays.
        """
        batch = self.sample(batch_size)
        columns = []
        for field_index in range(5):
            column = [experience[field_index] for experience in batch]
            columns.append(np.array(column))
        states, actions, rewards, next_states, game_over = columns
        return states, actions, rewards, next_states, game_over

In [26]:
def Linear_QNet(input_shape, n_hidden, units_per_hidden, output_size):
    """Assemble a fully connected Sequential network for Q-value estimation.

    Args:
        input_shape: shape passed to the Keras ``Input`` layer.
        n_hidden: number of hidden Dense layers to add.
        units_per_hidden: indexable of layer widths; entries ``0..n_hidden-1``
            are read, any further entries are ignored.
        output_size: number of units of the final Dense layer, which has no
            activation.

    Returns:
        The assembled (not compiled) ``keras.models.Sequential`` model.
    """
    net = keras.models.Sequential()
    net.add(keras.layers.Input(shape=input_shape))
    # Widths are looked up lazily, one per hidden layer, in order.
    hidden_widths = (units_per_hidden[layer_no] for layer_no in range(n_hidden))
    for width in hidden_widths:
        net.add(keras.layers.Dense(width, activation='relu'))
    # Linear output head: one unit per output value.
    net.add(keras.layers.Dense(units=output_size))
    return net

class QTrainer:
    """Runs single gradient steps of Q-learning on a Keras model.

    Uses the Adam optimizer and a mean-squared-error loss between the model's
    Q-value for the action taken and the Bellman target.
    """

    def __init__(self, model, lr=1e-3, gamma=0.9):
        """Keep a reference to ``model`` and set up the optimizer and loss.

        Args:
            model: callable Keras model mapping a batch of states to Q-values.
            lr: learning rate given to Adam.
            gamma: discount factor applied to the estimated future reward.
        """
        self.model = model
        self.gamma = gamma
        self.optimizer = keras.optimizers.Adam(learning_rate=lr)
        self.loss_fn = keras.losses.mean_squared_error

    def train_step(self, states, actions, rewards, next_states, dones):
        """Apply one gradient update on a batch of transitions.

        Args:
            states: batch of states fed to the model.
            actions: batch of action masks; they are multiplied element-wise
                with the predicted Q-values and summed over axis 1, so they are
                presumably one-hot encoded (verify against the caller).
            rewards: batch of rewards, one per transition.
            next_states: batch of successor states.
            dones: batch of terminal flags (1 for terminal); a terminal
                transition contributes no future reward to the target.
        """
        # The targets are computed outside the gradient tape, so no gradient
        # flows through the next-state estimate.
        next_Q_values = self.model(next_states)
        max_next_Q_values = tf.reduce_max(next_Q_values, axis=1)
        # Bellman equation: Q value = reward + discount factor * expected future reward
        target_Q_values = rewards + (1 - dones) * self.gamma * max_next_Q_values
        # Give the targets the same (batch, 1) shape as the predictions below.
        # Without this the loss receives a (batch,) target against a (batch, 1)
        # prediction; a plain broadcasting squared difference would then build
        # a (batch, batch) error matrix instead of comparing element by element.
        target_Q_values = tf.reshape(target_Q_values, (-1, 1))

        with tf.GradientTape() as tape:
            # Q-values predicted for every action in each state.
            all_Q_values = self.model(states)
            # Apply the action masks to keep only the Q-value of the action taken.
            Q_values = tf.reduce_sum(all_Q_values * actions, axis=1, keepdims=True)
            # Loss between the Bellman target and the current estimate.
            loss = tf.reduce_mean(self.loss_fn(target_Q_values, Q_values))
        # Backpropagation
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

In [27]:
def add_dimension(states, actions, rewards, next_states, dones):
    """Insert a new leading axis (axis 0) into each of the five inputs.

    Used when a single transition has to be presented as a batch of one.

    Returns:
        Tuple ``(state, action, reward, next_state, done)`` of expanded
        tensors, in the same order as the arguments.
    """
    fields = (states, actions, rewards, next_states, dones)
    state, action, reward, next_state, done = (
        tf.expand_dims(field, axis=0) for field in fields
    )
    return state, action, reward, next_state, done

def convert_to_tensorflow(states, actions, rewards, next_states, dones):
    """Convert each of the five inputs to a float32 TensorFlow tensor.

    Returns:
        Tuple ``(states, actions, rewards, next_states, dones)`` of tensors,
        in the same order as the arguments.
    """
    fields = (states, actions, rewards, next_states, dones)
    return tuple(tf.convert_to_tensor(field, dtype=tf.float32) for field in fields)

In [28]:
class Agent:
    """Deep Q-learning agent that plays and learns on a SnakeGameAI environment.

    The agent owns a replay buffer, a Q-network and a trainer. Actions are
    chosen with an epsilon-greedy policy over 3 possible moves and are handed
    to the environment as one-hot lists.
    """

    def __init__(self, lr, gamma, max_memory, batch_size, nn_model):
        """Set up the replay memory and the trainer around ``nn_model``.

        Args:
            lr: learning rate forwarded to the trainer.
            gamma: discount factor forwarded to the trainer.
            max_memory: capacity of the replay buffer.
            batch_size: number of experiences sampled per replay training step.
            nn_model: Keras model mapping a state batch to one Q-value per move.
        """
        self.n_games = 0
        self.epsilon = 1  # start fully random; decayed at the end of each game
        self.lr = lr
        self.gamma = gamma
        self.memory = ReplayBuffer(max_size=max_memory)
        self.batch_size = batch_size
        self.model = nn_model
        self.trainer = QTrainer(self.model, lr=self.lr, gamma=self.gamma)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer as a tuple of float32 tensors."""
        self.memory.append((convert_to_tensorflow(state, action, reward, next_state, done)))

    def train_long_memory(self):
        """Train on a random batch of ``batch_size`` remembered transitions."""
        states, actions, rewards, next_states, dones = self.memory.sample_experiences(self.batch_size)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        """Train on a single transition, presented to the trainer as a batch of one."""
        state, action, reward, next_state, done = convert_to_tensorflow(state, action, reward, next_state, done)
        state, action, reward, next_state, done = add_dimension(state, action, reward, next_state, done)
        self.trainer.train_step(state, action, reward, next_state, done)

    def epsilon_greedy_policy(self, state):
        """Return a move index: random with probability ``epsilon``, else greedy.

        ``state`` is indexed with ``np.newaxis`` to form a batch of one before
        being passed to the model.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(3)
        else:
            Q_values = self.model(state[np.newaxis])
            return np.argmax(Q_values[0])

    def get_action(self, state):
        """Return the chosen move as a one-hot list of length 3."""
        final_move = [0, 0, 0]
        move = self.epsilon_greedy_policy(state)
        final_move[move] = 1
        return final_move

    def train(self, N_GAME, visual=True):
        """Play and learn until ``self.n_games`` reaches ``N_GAME``.

        After every step the agent trains on that single transition and stores
        it; at the end of every game it trains on a replayed batch and decays
        epsilon linearly, reaching 0 after 70% of ``N_GAME`` games. The model
        is saved whenever a new record score is set.

        Training stops early when more than 100 games have been played and the
        record is still 5 or less.

        Args:
            N_GAME: total number of games to reach.
            visual: use the rendering environment when True, the headless one
                otherwise.

        Returns:
            Tuple ``(plot_scores, plot_mean_scores)``: the score of each game
            played in this call and the running mean after each of them.
        """
        if visual:
            env = enviroment_visual.SnakeGameAI(speed=0)
        else:
            env = enviroment_no_visual.SnakeGameAI()
        plot_scores = []
        plot_mean_scores = []
        total_score = 0
        record = 0

        while self.n_games < N_GAME:
            state_old = env.get_state()
            final_move = self.get_action(state_old)

            state_new, reward, done, score = env.play_step(final_move)

            self.train_short_memory(state_old, final_move, reward, state_new, done)
            self.remember(state_old, final_move, reward, state_new, done)

            if done:
                env.reset()
                self.n_games += 1
                self.train_long_memory()
                # The epsilon shown is the one used during the game just finished.
                print(f"\rGame: {self.n_games}, Epsilon: {self.epsilon:.3f}, Score: {score}, Record: {record}.", end="")
                self.epsilon = max((1 - self.n_games / (0.7 * N_GAME)), 0)

                if score > record:
                    record = score
                    self.save_model()
                plot_scores.append(score)
                total_score += score
                # Average over the games recorded in this call: total_score
                # only covers these, while self.n_games may also count games
                # played in an earlier call on the same agent.
                mean_score = total_score / len(plot_scores)
                plot_mean_scores.append(mean_score)
                # Give up on runs that are clearly not learning.
                if record<=5 and self.n_games>100:
                    break

        return plot_scores, plot_mean_scores

    def save_model(self, model_dir_path="./DQNmodel", file_name='model.keras'):
        """Save the model to ``model_dir_path/file_name``, creating the folder if needed.

        Missing parent folders are created as well, and a folder that appears
        between the existence check and its creation is not treated as an error.
        """
        if not os.path.exists(model_dir_path):
            print(f"La cartella non esiste. Sarà creata con nome: {model_dir_path}")
            os.makedirs(model_dir_path, exist_ok=True)
        file_name = os.path.join(model_dir_path, file_name)
        self.model.save(file_name)

Mostra andamento dello score per partita durante il training

In [29]:
def plot_trand(scores, mean_scores, save_path=None):
    """Plot per-game scores and their running mean over the course of training.

    The last mean value is written next to the end of the mean curve. With an
    empty history the curves are drawn empty and the annotation is skipped.

    Args:
        scores: score of each game, in playing order.
        mean_scores: running mean score after each game.
        save_path: when given, the figure is also written to this path before
            being shown.
    """
    plt.plot(scores, label='Score')
    plt.plot(mean_scores, label='Mean score')
    # Indexing [-1] on an empty history would raise IndexError, so only
    # annotate when at least one game was recorded.
    if len(mean_scores) > 0:
        last_mean_score = mean_scores[-1]
        plt.text(len(mean_scores)-1, last_mean_score, f'{last_mean_score}', fontsize=12, color="darkorange", ha='center')

    plt.title("Andamento del training")
    plt.xlabel("Partite")
    plt.ylabel("Score")
    plt.legend()
    # Save before show().
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [30]:
# Q-network: input of size 11, one hidden layer of 256 units, 3 outputs.
model = Linear_QNet(
    input_shape=[11],
    n_hidden=1,
    units_per_hidden=[256],
    output_size=3,
)
# Agent wrapping the network with its learning and replay settings.
agent = Agent(
    lr=0.001,
    gamma=0.9,
    max_memory=100_000,
    batch_size=5_000,
    nn_model=model,
)
# Train for up to 150 games with rendering on, then plot the score history.
plot_scores, plot_mean_scores = agent.train(N_GAME=150, visual=True)
plot_trand(plot_scores, plot_mean_scores)

Game: 4, Epsilon: 0.971429, Score: 0, Record: 0.

NameError: name 'quit' is not defined

Carica un modello salvato

In [8]:
def load_model_from_file(model_dir_path="./DQNmodel", file_name='model.keras'):
    """Load a saved Keras model from ``model_dir_path/file_name``.

    Returns:
        The model returned by ``keras.models.load_model``.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    file_path = os.path.join(model_dir_path, file_name)
    # Fail early with an explicit message when there is nothing to load.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Il file {file_path} non esiste.")
    print(f"Caricamento del modello da: {file_path}")
    return keras.models.load_model(file_path)

# GRID SEARCH PER IPERPARAMETRI

In [None]:
# Hyperparameter grid: every combination below is trained once.
lr_space = [0.05, 0.01, 0.001]
gamma_space = [0.9, 0.95, 0.99]
max_memory_space = [50_000, 100_000]
batch_size_space = [1000, 2500, 5000]
# Hidden-layer widths of each candidate network, aligned index by index with
# model_name. Only the layout is stored here: the network itself is built
# inside the loop so that every run starts from freshly initialised weights
# instead of continuing from a model trained by a previous combination.
hidden_units_space = [[128], [256], [512],
                      [64, 64], [128, 64], [64, 128]]
model_name = ["h=1,128", "h=1,256", "h=1,512",
              "h=2,64-64", "h=2,128-64", "h=2,64-128"]
N_GAME = 150

# Output folders; created up front so saving cannot fail on a missing directory.
score_dir = "GridSearch/ScoreLists/"
plot_dir = "GridSearch/Plots/"
os.makedirs(score_dir, exist_ok=True)
os.makedirs(plot_dir, exist_ok=True)

for lr_curr in lr_space:
    for gamma_curr in gamma_space:
        for max_memory_curr in max_memory_space:
            for batch_size_curr in batch_size_space:
                # Pair each layout with its own label so that every model
                # writes to a distinct file.
                for hidden_units_curr, model_name_curr in zip(hidden_units_space, model_name):
                    # NOTE: the ':' characters in this name are not valid in Windows file names.
                    file_name = "lr:"+str(lr_curr)+"_gamma:"+str(gamma_curr)+"_max_memory:"+str(max_memory_curr)+"_batch_size:"+str(batch_size_curr)+"_model:"+model_name_curr
                    print("Esecuzione di: " + file_name)
                    model_curr = Linear_QNet([11], len(hidden_units_curr), hidden_units_curr, 3)
                    agent = Agent(lr_curr, gamma_curr, max_memory_curr, batch_size_curr, model_curr)
                    # NOTE: on each new record, train() saves the model to the
                    # agent's default path, so runs overwrite each other's
                    # saved model.
                    ls_scores, ls_mean_scores = agent.train(N_GAME=N_GAME, visual=False)
                    np.save(score_dir+file_name+'.npy', ls_scores)
                    plot_trand(ls_scores, ls_mean_scores, save_path=plot_dir+file_name+".png")

Testa su 10 partite la performance del modello addestrato.

In [26]:
# Play MAX_N_GAMES games on the headless environment, always taking the move
# with the highest predicted Q-value (no exploration), and report the scores.
env = enviroment_no_visual.SnakeGameAI()
MAX_N_GAMES = 10
cumulative_score = 0
max_score = 0

for n_game in range(MAX_N_GAMES):
    env.reset()
    state = env.get_state()
    game_over = False
    while not game_over:
        # Greedy move: index of the largest Q-value for the current state.
        action = np.argmax(model(state[np.newaxis])[0])
        # One-hot encoding of the move, as expected by play_step.
        final_move = [1 if slot == action else 0 for slot in range(3)]
        state, reward, game_over, score = env.play_step(final_move)
    max_score = max(max_score, score)
    cumulative_score += score
    print(f"\rGame: {n_game}, Score: {score}, Record: {max_score}.", end="")

print(f"\nMean score: {cumulative_score/ MAX_N_GAMES}, Max score: {max_score}", end="")

Game: 9, Score: 31, Record: 41.
Mean score: 25.9, Max score: 41

Mostra visualmente una partita dell'Agente

In [27]:
# Play a single game on the rendering environment with the greedy policy.
env_visual = enviroment_visual.SnakeGameAI(speed=0)
state = env_visual.get_state()
game_over = False
while not game_over:
    # Pick the move with the highest predicted Q-value for the current state.
    action = np.argmax(model(state[np.newaxis])[0])
    # One-hot encoding of the move, as expected by play_step.
    final_move = [1 if slot == action else 0 for slot in range(3)]
    state, reward, game_over, score = env_visual.play_step(final_move)