In [150]:
import numpy as np
import enviroment
import tensorflow as tf
import keras

In [151]:
input_shape = [11]
n_outputs = 3
model = keras.models.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape = input_shape),
    keras.layers.Dense(n_outputs)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [152]:
def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        Q_values = model.predict(state[np.newaxis], verbose=0)
        return np.argmax(Q_values[0])

In [153]:
from collections import deque

replay_buffer = deque(maxlen=100_000)

In [154]:
def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    states, actions, rewards, next_states, game_over = [
        np.array([experience[field_index] for experience in batch])
                 for field_index in range(5)]
    return states, actions, rewards, next_states, game_over

In [155]:
def play_one_step(env, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)

    final_move = [0,0,0]
    final_move[action] = 1
    
    next_state, reward, game_over, score = env.play_step(final_move)
    replay_buffer.append((state, action, reward, next_state, game_over))
    return next_state, reward, game_over, score

In [156]:
batch_size = 5000
discount_factor = 0.9
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    # Campionamento delle esperienze
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    # Predizione dei valori Q per gli stati successivi
    next_Q_values = model.predict(next_states, verbose=0)
    max_next_Q_values = next_Q_values.max(axis=1)
    # Calcolo dei target utilizzando l'equazione di Bellman
    runs = 1.0 - dones 
    target_Q_values = rewards + runs * discount_factor * max_next_Q_values
    target_Q_values = target_Q_values.reshape(-1, 1)

    mask = tf.one_hot(actions, n_outputs)
    # Calcolo della loss
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    # Calcolo dei gradienti e aggiornamento del modello
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [157]:
rewards = [] 
best_score = 0
env = enviroment.SnakeGameAI()
MAX_N_GAMES = 200

for episode in range(MAX_N_GAMES):
    print(episode)
    env.reset()    
    state = env.get_state()
    restart = False
    step = 0
    while not restart:
        epsilon = (80 - episode) / 100
        state, reward, done, score = play_one_step(env, state, epsilon)
        
        restart = done
        step+=1
    # extra code – displays debug info, stores data for the next figure, and
    #              keeps track of the best model weights so far
    print(f"\rEpisode: {episode + 1}, Steps: {step + 1}, eps: {epsilon:.3f}, score: {env.score}",
          end="")
    rewards.append(step)
    if score >= best_score:
        best_weights = model.get_weights()
        best_score = score

    training_step(batch_size)

model.set_weights(best_weights) 

0
Episode: 1, Steps: 43, eps: 0.800, score: 0.0001
Episode: 2, Steps: 23, eps: 0.790, score: 0.0002
Episode: 3, Steps: 34, eps: 0.780, score: 0.0003
Episode: 4, Steps: 51, eps: 0.770, score: 0.0004
Episode: 5, Steps: 93, eps: 0.760, score: 0.0005
Episode: 6, Steps: 27, eps: 0.750, score: 0.0006
Episode: 7, Steps: 43, eps: 0.740, score: 0.0007
Episode: 8, Steps: 72, eps: 0.730, score: 0.0008
Episode: 9, Steps: 22, eps: 0.720, score: 0.0009
Episode: 10, Steps: 75, eps: 0.710, score: 0.00010
Episode: 11, Steps: 79, eps: 0.700, score: 0.00011
Episode: 12, Steps: 15, eps: 0.690, score: 1.00012
Episode: 13, Steps: 71, eps: 0.680, score: 0.00013
Episode: 14, Steps: 44, eps: 0.670, score: 0.00014
Episode: 15, Steps: 129, eps: 0.660, score: 0.00015
Episode: 16, Steps: 64, eps: 0.650, score: 0.00016
Episode: 17, Steps: 90, eps: 0.640, score: 0.00017
Episode: 18, Steps: 62, eps: 0.630, score: 0.00018
Episode: 19, Steps: 28, eps: 0.620, score: 0.00019
Episode: 20, Steps: 25, eps: 0.610, score: 0.0

KeyboardInterrupt: 

In [148]:
best_score

183