In [2]:
import numpy as np
import torch
from agent import Agent, advanced_random_policy, random_policy, sarsa
from game import Game
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# `torch.cuda.is_available` has to be *called*: the bare function object is
# always truthy, so without the parentheses `device` would be 'cuda' even on a
# CPU-only machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### First, we train an agent for 2D tic-tac-toe using Q-learning with our modified SARSA algorithm

In [19]:
%%time
game = Game(None, None, n_dim=2, size=3)
agent = Agent(size=3)

n_eps = 20000
# Trains agent with the random policy
sarsa(game, agent, random_policy, alpha=0.45, alpha_factor=0.9995**(10000/n_eps), gamma=0.7, epsilon=1.0, \
      epsilon_factor=0.9997**(10000/n_eps), r_win=11.0, r_lose=0.0, r_even=1.0, r_even2=1.25, num_episodes=20000)

100%|██████████| 20000/20000 [02:34<00:00, 129.62it/s]

Wall time: 2min 34s





We make our agent play 1000 games against the random policy

In [20]:
# Pit the trained agent against an opponent that follows `random_policy`.
random_opponent = Agent(size=3, policy=random_policy)
game = Game(agent, random_opponent, n_dim=2, size=3)

# simulate_games returns six counters; they are split here into the agent's
# wins, the opponent's wins and the even games for each of the two series.
outcome = game.simulate_games(1000)
agent_1_win, oppo_2_win, agent_1_even, agent_2_win, oppo_1_win, agent_2_even = outcome

tot_even = agent_1_even + agent_2_even
tot_agent_lose = oppo_1_win + oppo_2_win
tot_agent_win = agent_1_win + agent_2_win
print(
    "On 1000 games, agent won", tot_agent_win,
    "times, lost", tot_agent_lose,
    "times and made", tot_even, "even games.",
)

On 1000 games, agent won 890 times, lost 0 times and made 110 even games.


Now we make our agent play 1000 games against the advanced policy

In [21]:
# Same evaluation, this time against an opponent using `advanced_random_policy`.
advanced_opponent = Agent(size=3, policy=advanced_random_policy)
game = Game(agent, advanced_opponent, n_dim=2, size=3)

# Six counters: agent wins, opponent wins and even games for each of the two series.
outcome = game.simulate_games(1000)
agent_1_win, oppo_2_win, agent_1_even, agent_2_win, oppo_1_win, agent_2_even = outcome

tot_even = agent_1_even + agent_2_even
tot_agent_lose = oppo_1_win + oppo_2_win
tot_agent_win = agent_1_win + agent_2_win
print(
    "On 1000 games, agent won", tot_agent_win,
    "times, lost", tot_agent_lose,
    "times and made", tot_even, "even games.",
)

On 1000 games, agent won 244 times, lost 7 times and made 749 even games.


A human player can also play a game against our agent

In [22]:
# Interactive game between the trained agent and a human typing coordinates.
agent_plays_first = True  # set to False to make the human player move first
if agent_plays_first:
    players = (agent, "Human player")
else:
    players = ("Human player", agent)
game = Game(*players, n_dim=2, size=3)
game.play_a_game()

. . . 
. . . 
. . . 

Agent plays : (1, 1) 

. . . 
. X . 
. . . 

Coordinates of next move : 0 2

. . O 
. X . 
. . . 

Agent plays : (0, 1) 

. X O 
. X . 
. . . 

Coordinates of next move : 21

. X O 
. X . 
. O . 

Agent plays : (1, 2) 

. X O 
. X X 
. O . 

Coordinates of next move : 1 - 0

. X O 
O X X 
. O . 

Agent plays : (0, 0) 

X X O 
O X X 
. O . 

Coordinates of next move : 2, 2

X X O 
O X X 
. O O 

Agent plays : (2, 0) 

X X O 
O X X 
X O O 

Game over. Score : (0, 0)
Even score.


(0, 0)

We can evaluate the performance of our agent as a function of the number of training episodes

In [None]:
# Learning curve: train a fresh agent for an increasing number of episodes and
# record how it fares over a fixed number of test games.
n_test_games = 10000  # test games simulated after each training run
episode_counts = [1, 10, 100, 200, 400, 700, 1000, 2000, 3000, 4000, 5000, 7000, 10000, 15000, 20000]

win, lose, draw, n_eps = [], [], [], []
for n in episode_counts:
    # Rescale the decay factors so that factor**n matches the reference decay
    # over 10000 episodes (0.9995**10000 and 0.9997**10000).
    ar = 0.9995 ** (10000 / n)
    er = 0.9997 ** (10000 / n)
    agent = Agent(size=3)
    # NOTE(review): the test opponent uses `advanced_random_policy`, while the
    # variable name and the plot title both say "random policy" -- confirm
    # which one is intended.
    random_opponent = Agent(size=3, policy=advanced_random_policy)
    game = Game(agent, random_opponent, n_dim=2, size=3)
    sarsa(
        game, agent, random_policy,
        alpha=0.45, alpha_factor=ar, gamma=0.7,
        epsilon=1.0, epsilon_factor=er,
        r_win=11.0, r_lose=0.0, r_even=1.0, r_even2=1.25,
        num_episodes=n,
    )
    win_p1_a, win_p2_a, tot_even_a, win_p1_b, win_p2_b, tot_even_b = game.simulate_games(n_test_games)
    win.append(win_p1_a + win_p1_b)
    lose.append(win_p2_a + win_p2_b)
    draw.append(tot_even_a + tot_even_b)
    n_eps.append(n)

# Convert raw counts to percentages of the test games in plain Python, so the
# cell does not depend on a numpy import.
win_pct = [100 * count / n_test_games for count in win]
lose_pct = [100 * count / n_test_games for count in lose]
draw_pct = [100 * count / n_test_games for count in draw]

fig, ax = plt.subplots(figsize=(0.7*6.4, 0.7*4.8))
ax.plot(n_eps, win_pct, label='win')
ax.plot(n_eps, lose_pct, label='lose')
ax.plot(n_eps, draw_pct, label='draw')
ax.set_xscale('log')
ax.set_xlabel('number of episodes played in training')
ax.set_ylabel('test games (in %)')
ax.set_ylim(0, 105)
ax.set_title('Performances when training\nagainst the random policy and\nplaying against the random policy')
ax.legend();

100%|██████████| 1/1 [00:00<00:00, 76.91it/s]
100%|██████████| 10/10 [00:00<00:00, 90.65it/s]
100%|██████████| 100/100 [00:01<00:00, 94.68it/s]
100%|██████████| 200/200 [00:01<00:00, 101.83it/s]
100%|██████████| 400/400 [00:03<00:00, 104.12it/s]
100%|██████████| 700/700 [00:06<00:00, 107.10it/s]
100%|██████████| 1000/1000 [00:08<00:00, 112.96it/s]
100%|██████████| 2000/2000 [00:16<00:00, 121.59it/s]
100%|██████████| 3000/3000 [00:24<00:00, 122.57it/s]


### Now, we show our Policy Gradient model for the 2D and 3D cases
We start with the 2D case.

We train our model for 10k iterations with a batch size of 1000

In [4]:
import neuralAgent as na
import torch
from game import Game

# `torch.cuda.is_available` has to be *called*: the bare function object is
# always truthy, so without the parentheses `device` would always be 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Training budget. Per the notebook text this is 10k iterations with a batch
# size of 1000; presumably that is also the positional order expected by
# na.train_network -- TODO confirm against its signature.
n_iterations = 10000
batch_size = 1000

game = Game(None, None, n_dim=2, size=3)
agent1 = na.Model()
# train_network returns five values; only the trained model and `values` are kept.
agent1, values, _, _, _ = na.train_network(agent1, game, n_iterations, batch_size)
agent1.save()
print(values)

100%|██████████| 10000/10000 [00:43<00:00, 231.66it/s]

[array(0.20265856, dtype=float32), array(0.20265856, dtype=float32), array(0.2505353, dtype=float32), array(0.2505353, dtype=float32), array(0.24982487, dtype=float32), array(0.24982487, dtype=float32), array(0.23770878, dtype=float32), array(0.23770878, dtype=float32), array(0.23626584, dtype=float32), array(0.23626584, dtype=float32), array(0.23059121, dtype=float32), array(0.23059121, dtype=float32), array(0.2311154, dtype=float32), array(0.2311154, dtype=float32), array(0.22827327, dtype=float32), array(0.22827327, dtype=float32), array(0.2278862, dtype=float32), array(0.2278862, dtype=float32)]





We make our agent play 1000 games and then 10000 against the random policy

In [2]:
# Example of loading a saved agent
loaded_agent = na.Model()
loaded_agent.load()
wins, draw, loses = na.test_against_random(agent1, game, 1000)
print("Win {}, Draw {}, Loses {}".format(wins, draw, loses))

100%|██████████| 1000/1000 [00:04<00:00, 221.61it/s]


Win 880, Draw 24, Loses 96


In [3]:
# Same test with ten times more games.
outcome = na.test_against_random(agent1, game, 10000)
wins, draw, loses = outcome
print("Win {}, Draw {}, Loses {}".format(wins, draw, loses))

100%|██████████| 10000/10000 [00:43<00:00, 227.38it/s]


Win 8668, Draw 270, Loses 1062


In [5]:
# Interactive game between the policy-gradient model and a human player.
# NOTE(review): the recorded output of this cell is a TypeError
# ("get_best_possible_from_board() missing 2 required positional arguments:
# 'size' and 'n_dim'"), raised after the empty board is printed. The failing
# call is not visible in this notebook -- the fix has to be made in the project
# code that calls get_best_possible_from_board.
game = Game(agent1, "Human player", n_dim=2, size=3) # agent plays first
#game = Game("Human player", agent1, n_dim=2, size=3) # to play first
game.play_a_game()

. . . 
. . . 
. . . 



TypeError: get_best_possible_from_board() missing 2 required positional arguments: 'size' and 'n_dim'

#### Now the 3D case 

In [3]:
# 3-D board: build the game and a model created with na.Model(3, 3), then train it.
game = Game(None, None, size=3, n_dim=3)
agent1 = na.Model(3, 3)
training_output = na.train_network(agent1, game, 10000, 1000)
# Five values come back; only the trained model and `values` are kept.
agent1, values, _, _, _ = training_output
#print(values)

100%|██████████| 10000/10000 [10:37<00:00, 15.68it/s]


[array(0.2168715, dtype=float32), array(0.2168715, dtype=float32), array(0.2672852, dtype=float32), array(0.2672852, dtype=float32), array(0.28510368, dtype=float32), array(0.28510368, dtype=float32), array(0.26848608, dtype=float32), array(0.26848608, dtype=float32), array(0.31130293, dtype=float32), array(0.31130293, dtype=float32), array(0.30502424, dtype=float32), array(0.30502424, dtype=float32), array(0.3202326, dtype=float32), array(0.3202326, dtype=float32), array(0.34884772, dtype=float32), array(0.34884772, dtype=float32), array(0.36489576, dtype=float32), array(0.36489576, dtype=float32)]


We show its performance against the random policy

In [4]:
# Play 1000 test games with the 3-D model and report the outcome counts.
outcome = na.test_against_random(agent1, game, 1000)
wins, draw, loses = outcome
print("Win {}, Draw {}, Loses {}".format(wins, draw, loses))


100%|██████████| 1000/1000 [00:58<00:00, 16.99it/s]


Win 964, Draw 25, Loses 11
