In [1]:
import os

from algorithms import *
from snake_environment import *
from states_bracket import *

In [2]:
def print_q_value(dictionary):
    """Print every entry of a Q-value table, one line per key.

    Each key is indexed positionally: elements 0 and 1 are shown as the
    state pair, element 2 as the action, and the mapped value as the
    Q-value.
    """
    for key, q_value in dictionary.items():
        first, second, action = key[0], key[1], key[2]
        print(f"State ({first}, {second}), Action {action} : Value {q_value}")

def opposite_action(action):
    """Return the action paired with `action` (0 <-> 1, 2 <-> 3).

    Raises:
        KeyError: if `action` is not one of 0, 1, 2, 3.
    """
    reverse_of = {
        0: 1,
        1: 0,
        2: 3,
        3: 2,
    }
    return reverse_of[action]

In [11]:
# Bracketer instance handed to every learning(...) / play(...) call below.
bracketer = FoodRelativePositionBracket()
# Hyper-parameters shared by the following cells. They are also embedded in
# the saved-model file names, and the DDQN section later rebinds these names.
gamma = 0.9  # passed as gamma= to the agent constructors
lr_v = 0.15  # passed as lr_v= to the agent constructors
epsilon = 0.4  # passed positionally to learning(...)
n_episodes = 5000  # passed positionally to learning(...)

Proviamo il QLearning

In [12]:
# Training environment. render_mode="nonhuman" presumably runs without an
# on-screen window -- TODO confirm in snake_environment.
env = SnakeEnv(render_mode="nonhuman")
# Q-learning agent sized by the environment's number of actions.
Q_p = QLearning(env.action_space.n, gamma=gamma, lr_v=lr_v)
# Train the agent. NOTE: the recorded output of this cell ends in a
# KeyboardInterrupt, i.e. the stored run was stopped manually.
Q_p.learning(env, epsilon, n_episodes, bracketer)

0
500
1000


KeyboardInterrupt: 

In [None]:
# Persist the trained Q-learning agent.
# The file name carries the "QLearning " prefix so that it is exactly the
# name the reload cell passes to Q_p.upload(...), and follows the
# "<ALGO> gamma ... lr ... epsilon ... episodes ..." pattern used for the
# SARSA, DDQN and MC models.
path = "./models/"
Q_p.save(f"{path}QLearning gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [5]:
# Rebuild the environment with render_mode="human" for the play cell below.
env = SnakeEnv(render_mode="human")
path = "./models/"
# Fresh agent whose contents are then filled from a saved file.
Q_p = QLearning(env.action_space.n, gamma=gamma, lr_v=lr_v)
# The file name is rebuilt from the current globals, so gamma, lr_v, epsilon
# and n_episodes must equal the values in effect when the model was saved.
Q_p.upload(f"{path}QLearning gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [6]:
# Run the loaded Q-learning agent on the render_mode="human" environment
# created in the previous cell.
Q_p.play(env, bracketer)

Proviamo SARSA

In [7]:
# Environment used for SARSA training (render_mode="nonhuman").
env = SnakeEnv(render_mode="nonhuman")

# SARSA agent sized by the environment's number of actions.
SARSA_p = SARSA(env.action_space.n, gamma=gamma, lr_v=lr_v)
# Training is disabled in this version of the notebook; the agent is filled
# from a saved file by a later upload(...) cell. Uncomment to retrain:
#SARSA_p.learning(env, epsilon, n_episodes, bracketer)

In [6]:
# `path` is also read by the SARSA upload cell that follows.
path = "./models/"
# Saving is disabled together with training; uncomment after retraining:
#SARSA_p.save(f"{path}SARSA gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [8]:
# Load the saved SARSA model. Relies on `path` from the previous cell and on
# gamma / lr_v / epsilon / n_episodes matching the values used in the saved
# file's name.
SARSA_p.upload(f"{path}SARSA gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [12]:
# Greedy roll-out of the loaded SARSA agent on a render_mode="human" env.
# At every step after the first, the action returned by
# opposite_action(last_action) is removed from the candidates given to
# get_action_greedy (presumably so the snake cannot reverse onto itself).
done = False
keep = True
env = SnakeEnv(render_mode="human")
last_action = None

try:
    state, _ = env.reset()
    state = bracketer.bracket(state)

    while not done and keep:
        # Start from the full action set on every step, then drop the
        # reverse of the previous action once there is one.
        possible_action = [0, 1, 2, 3]
        if last_action is not None:
            possible_action.remove(opposite_action(last_action))
        action = SARSA_p.get_action_greedy(state, possible_action=possible_action)
        last_action = action
        state, reward, terminated, trunc, inf = env.step(action)
        # End the episode on truncation as well as termination; assuming
        # SnakeEnv follows the Gymnasium step contract, stepping an env
        # after either flag is set is not supported without a reset.
        done = terminated or trunc
        state = bracketer.bracket(state)
        # The render() return value drives the loop: a falsy value stops it
        # (presumably when the user closes the window -- TODO confirm).
        keep = env.render()
finally:
    # Always release the environment, even if the loop raises or the
    # kernel is interrupted.
    env.close()

Setting gamma to 0.999 (i.e. an effective time horizon of about 1000 steps, together with a maximum of 1000 steps per episode) makes the agent learn something about how to approach the food so that it avoids ending up in its own tail.

Proviamo DDQN


In [3]:
# DDQN-specific constants, forwarded to the DeepDoubleQLearning constructor.
BATCH_SIZE = 128  # passed as batch_size=
MEMORY_SIZE = 10000  # passed as memory_size=
TARGET_UPDATE_FREQ = 200  # passed as target_update_freq=

# Bracketer
bracketer = FoodRelativePositionBracket()
# General Settings
# NOTE: these rebind the globals set in the tabular settings cell, so any
# cell run afterwards (including save/upload file names) sees these values.
gamma = 0.95
lr_v = 0.001
epsilon = 0.1
n_episodes = 20001

# Environment
env = SnakeEnv(render_mode="nonhuman")
# State dimension reported by the bracketer, passed as state_dim= below.
state_dim = bracketer.get_state_dim()

# DDQN agent, explicitly placed on the CPU.
ddqn = DeepDoubleQLearning(
    env.action_space.n,
    state_dim=state_dim,
    gamma=gamma,
    lr_v=lr_v,
    batch_size=BATCH_SIZE,
    memory_size=MEMORY_SIZE,
    target_update_freq=TARGET_UPDATE_FREQ,
    device='cpu'
)

In [4]:
# Fresh training environment (the config cell already created an identical one).
env = SnakeEnv(render_mode="nonhuman")
# Train the DDQN agent; the recorded output shows a counter every 500
# episodes up to 20000.
ddqn.learning(env, epsilon, n_episodes, bracketer)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000


In [5]:
# Build the models directory path from the kernel's current working directory.
current_path = os.getcwd()
# The trailing "/" matters: later cells append the file name directly to
# model_path inside an f-string, without another separator.
model_path = os.path.join(current_path, "models/")
print(f"Model path: {model_path}")

Model path: D:\university\reinforcement\project\models/


In [6]:
# Save the DDQN agent under model_path; the file name encodes the current
# gamma / lr_v / epsilon / n_episodes globals.
ddqn.save(f"{model_path}DDQN gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [6]:
# Rebuild the DDQN agent with the same constructor arguments used for
# training and load its saved state for playback on a render_mode="human" env.
env = SnakeEnv(render_mode="human")
state_dim = bracketer.get_state_dim()
ddqn = DeepDoubleQLearning(
    env.action_space.n,
    state_dim=state_dim,
    gamma=gamma,
    lr_v=lr_v,
    batch_size=BATCH_SIZE,
    memory_size=MEMORY_SIZE,
    target_update_freq=TARGET_UPDATE_FREQ,
    # Same device as the training cell, so loading the saved model does not
    # depend on the constructor's default device.
    device='cpu'
)
# Relies on model_path from the model-path cell and on the hyper-parameter
# globals matching the values embedded in the saved file's name.
ddqn.upload(f"{model_path}DDQN gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [24]:
# Run the loaded DDQN agent on the render_mode="human" environment created
# in the reload cell.
ddqn.play(env, bracketer)

Monte Carlo

In [4]:
# Training environment for the Monte Carlo agent.
env = SnakeEnv(render_mode="nonhuman")

# NOTE: this section has no settings cell of its own; gamma and lr_v (and
# epsilon / n_episodes / bracketer in the cells below) are whatever was last
# assigned in the kernel.
MC = Montecarlo(env.action_space.n, gamma=gamma, lr_v=lr_v)

In [5]:
# Train the Monte Carlo agent. NOTE(review): the recorded output contains
# fragments of a numpy mean warning (possibly a mean over an empty array --
# TODO confirm) and ends in a KeyboardInterrupt after the 24000 counter.
MC.learning(env, epsilon, n_episodes, bracketer)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000


KeyboardInterrupt: 

In [6]:
# `path` is also read by the MC upload cell that follows.
path = "./models/"
# NOTE: the recorded training run ended in a KeyboardInterrupt, so the
# episode count in the file name is the configured n_episodes, not
# necessarily the number of episodes actually completed.
MC.save(f"{path}MC gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [7]:
# Rebuild the environment with render_mode="human" for the play cell below.
env = SnakeEnv(render_mode="human")
# Fresh Monte Carlo agent whose contents are then filled from a saved file.
MC = Montecarlo(env.action_space.n, gamma=gamma, lr_v=lr_v)
# Relies on `path` from the save cell and on the hyper-parameter globals
# matching the values embedded in the saved file's name.
MC.upload(f"{path}MC gamma {gamma} lr {lr_v} epsilon {epsilon} episodes {n_episodes}")

In [10]:
# Run the loaded Monte Carlo agent on the render_mode="human" environment
# created in the previous cell.
MC.play(env, bracketer)