# 1. Install dependencies


In [1]:
# Install the third-party packages this notebook imports later on: keras-rl and gym.
!pip install keras-rl
!pip install gym


[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# 2.1. Declare the AI classes: Random AI and Minimax AI


In [2]:
import random

class RandomAI:
    """Agent that plays a uniformly random legal move.

    The position is read from ``game.board`` (a 2-D numpy array in which ``0``
    marks an empty cell) every time a move is requested.
    """

    def __init__(self, game):
        """
        :param game: object exposing the current position as ``game.board``.
        """
        self.game = game

    def decide_turn(self):
        """
        Picks one of the empty cells of the board uniformly at random.

        :return: ``(row, column)`` of the chosen empty cell.
        :raises AssertionError: if the board has no empty cell left.
        """
        board = self.game.board
        n_rows, n_columns = board.shape

        # Enumerate the legal moves once instead of re-drawing random coordinates
        # until an empty cell happens to be hit.
        free_cells = [(row, column)
                      for row in range(n_rows)
                      for column in range(n_columns)
                      if board[row, column] == 0]

        # Raised explicitly (not via an `assert` statement) so the check also
        # survives `python -O`.
        if not free_cells:
            raise AssertionError("no place to make a turn!!!")

        return random.choice(free_cells)



In [3]:
from math import inf as infinity

class MinimaxAI:
    """Tic-tac-toe player that searches the game tree with minimax.

    While seven or more cells are still empty the move is delegated to a
    RandomAI instead of being searched for.
    """

    def __init__(self, game):
        self.game = game
        self.random_ai = RandomAI(game)

    def decide_turn(self):
        """
        Chooses the move for the side that is next to play according to ``game.x_next``.

        :return: ``(row, column)`` of the chosen cell.
        """
        candidates = self.empty_cells()
        n_candidates = len(candidates)

        if n_candidates == 1:
            # a single free cell left - nothing to search for
            row, column = candidates[0]
            return row, column

        if n_candidates >= 7:
            # early in the game the move is picked at random
            row, column = self.random_ai.decide_turn()
            return row, column

        row, column, _ = self.minimax(10, self.game.x_next)
        return row, column

    def empty_cells(self):
        """
        :return: list of ``(row, column)`` tuples of all empty cells, in row-major order.
        """
        board = self.game.board
        return [(row, column)
                for row in range(3)
                for column in range(3)
                if board[row, column] == 0]

    def minimax(self, depth, X_turn):
        """
        Recursive minimax search performed directly on ``game.board``; every trial
        move is undone before the next one is tried.

        :param depth: how many more plies may be searched.
        :param X_turn: True if x (the maximising side) is to move, False for o.
        :return: list ``[row, column, score]``; row and column are -1 when the search
            stops at this node without trying a move.
        """
        outcome = self.game.evaluate(self.game.board)

        # leaf: search budget exhausted or the game has been decided
        if depth == 0 or outcome is not None:
            return [-1, -1, outcome]

        mark = 1 if X_turn else -1
        best = [-1, -1, -infinity if X_turn else +infinity]

        for x, y in self.empty_cells():
            self.game.board[x, y] = mark
            candidate = self.minimax(depth - 1, not X_turn)
            self.game.board[x, y] = 0
            candidate[0], candidate[1] = x, y

            # strict comparison: among equally good moves the first one tried wins
            if X_turn:
                improves = candidate[2] > best[2]
            else:
                improves = candidate[2] < best[2]
            if improves:
                best = candidate

        return best

# 2.2. Environment class - the actual game implementation

In [4]:
# DEFINE THE TTT GAME CLASS

import numpy as np
import random
from functools import lru_cache
from gym.core import Env
from gym import spaces

class TicTacToe(Env):
    """
    Tic-tac-toe as a gym environment.

    The agent driving the environment through :meth:`step` plays 'x'; the opponent stored in
    ``self.o_ai`` plays 'o' and replies from inside :meth:`step`. The board is a 3x3 numpy array
    holding -1 for 'o', 0 for an empty cell and 1 for 'x'.
    """

    def __init__(self):
        self.board = np.zeros([3, 3], dtype=np.int8)
        self.x_next = random.random() > 0.5

        # by design, our AI plays x and the hard-coded AI plays for o
        self.x_ai = None
        self.o_ai = MinimaxAI(self)

        # action space consists of 9 distinct actions - trying to place your shape to each of 9 cells.
        self.action_space = spaces.Discrete(9)

        # observation space is the state of the board, as numpy array with possible values of
        # -1 - cell with 'o',
        # 0  - empty cell,
        # 1  - cell with 'x'
        self.observation_space = spaces.Box(low=-1, high=1, dtype=np.int8, shape=(3, 3))

    def reset(self):
        """
        Resets the board for the next game. Starting player is random; if 'o' starts, its first
        move is already placed on the returned board.
        :return: the board.
        """
        # same dtype as in __init__ and as declared by observation_space
        self.board = np.zeros([3, 3], dtype=np.int8)
        self.x_next = random.random() > 0.5
        if not self.x_next:
            row, column = self.o_ai.decide_turn()
            # the move is made outside of the assert so that it still happens under `python -O`
            o_moved = self.try_make_turn(row, column)
            assert o_moved

        return self.board

    def step(self, action):
        """
        Places an 'x' for the agent and, if the game is not over yet, lets ``self.o_ai`` reply.

        :param action: a discrete action from 0 to 8. Cells are enumerated from left to right,
        with columns continuing from top to bottom, like below:

         0 | 1 | 2
        -  + - + -
         3 | 4 | 5
        -  + - + -
         6 | 7 | 8

        if invalid action is used, board does not change, but negative reward (-1.5) is returned
        and the episode ends.
        :return: observation, reward, done?, info (empty). The reward is the value of
            :meth:`evaluate` when the game is over and 0 while it is still running.
        """
        assert self.x_next
        row = action // 3
        column = action % 3
        info = {}

        valid_turn = self.try_make_turn(row, column)
        if not valid_turn:
            return self.board, -1.5, True, info

        result = self.evaluate(self.board)
        if result is None:
            # the game goes on: the opponent replies right away
            row, column = self.o_ai.decide_turn()
            o_moved = self.try_make_turn(row, column)
            assert o_moved
            result = self.evaluate(self.board)

        if result is None:
            return self.board, 0, False, info
        return self.board, result, True, info

    def try_make_turn(self, row, column):
        """
        The current player according to the boolean self.x_next tries to make a turn
        by placing their shape on the intersection of :param row and :param column.
        On success the turn passes to the other player.
        :return: True if it was a valid turn, False if such turn is not allowed (the cell is not empty).
        """
        if self.board[row, column] != 0:
            return False

        self.board[row, column] = 1 if self.x_next else -1
        self.x_next = not self.x_next
        return True

    @staticmethod
    def evaluate(board):
        """
        takes a :param board as the input ([3,3] numpy array),
        :return:
            None if the game is not over,
            a positive score (1 - number_of_marks / 100) if x won,
            a negative score (-1 + number_of_marks / 100) if o won,
            0 if its a draw
        """
        # lru_cache needs a hashable argument, hence the conversion to nested tuples
        board_as_tuple = tuple(tuple(board[row]) for row in range(3))
        return TicTacToe._evaluate(board_as_tuple)

    def play_ai_game(self):
        """
        Lets ``self.x_ai`` and ``self.o_ai`` play the current game to the end.
        :return: the value of :meth:`evaluate` for the final board, or -1 / 1 if x / o
            respectively made an invalid turn.
        """
        while self.evaluate(self.board) is None:
            ai = self.x_ai if self.x_next else self.o_ai
            row, column = ai.decide_turn()
            valid_turn = self.try_make_turn(row, column)
            if not valid_turn:
                # any AI that makes invalid turn immediately loses the game.
                # (x_next was not flipped, so it still points at the offender)
                return -1 if self.x_next else 1
        return self.evaluate(self.board)

    @property
    def board_as_tuple(self):
        """The board as a tuple of three row tuples."""
        return tuple(tuple(self.board[row]) for row in range(3))

    @staticmethod
    @lru_cache(maxsize=2**16)
    def _evaluate(board):
        """
        Cached implementation of :meth:`evaluate`.
        :param board: the board as a tuple of three row tuples.
        """
        sums = []
        # we collect totals of all row, columns and diagonals. Any of those must have a value of
        # either 3 or -3 if there are x x x or o o o in this sequence.
        sums += [sum(board[row]) for row in range(3)]
        sums += [sum([board[i][column] for i in range(3)]) for column in range(3)]
        sum_main_diag = sum([board[i][i] for i in range(3)])
        sum_opp_diag = sum([board[i][2 - i] for i in range(3)])

        sums.append(sum_main_diag)
        sums.append(sum_opp_diag)

        # number of marks on the board; it shrinks the magnitude of a win/loss score,
        # so a win reached with fewer marks scores higher
        n_steps = sum(abs(elem) for row in board for elem in row)

        if 3 in sums:
            return 1 - n_steps / 100
        elif -3 in sums:
            return -1 + n_steps / 100
        else:
            n_empty = sum([1 for row in range(3) for cell in board[row] if cell == 0])
            if n_empty == 0:
                return 0
            else:
                return None

    def play_vs_human(self):
        """
        Interactive console game: the human plays 'x' by typing cell numbers, ``self.o_ai``
        plays 'o'. An invalid turn (occupied cell) loses the game for the human.
        """
        print("Welcome to Tic-Tac-Toe. Your opponent is a minimax AI, who makes first turn randomly. \n"
              "Use numbers from 0 to 8 to make turns. Below is the map of numbers to cells.")
        print("""
         0 | 1 | 2
        -  + - + -
         3 | 4 | 5
        -  + - + -
         6 | 7 | 8""")
        print("You play for x, your opponent plays for o")
        self.reset()
        while True:
            self.render()
            try:
                action = int(input(">>> "))
            except ValueError:
                # only parse failures are retried; Ctrl-C / end of input still stop the game
                print("could not parse your input. Please try again. Use numbers from 0 to 8 to make turns.")
                continue

            if not 0 <= action <= 8:
                # would otherwise index outside of the board (or wrap around for negatives)
                print("could not parse your input. Please try again. Use numbers from 0 to 8 to make turns.")
                continue

            row = action // 3
            column = action % 3

            valid_turn = self.try_make_turn(row, column)
            if not valid_turn:
                print("Unfortunately you can't overwrite existing shapes. \n"
                      "You have made an invalid turn and therefore lost the game.")
                break

            result = self.evaluate(self.board)

            if result is not None:
                self.render()
                # win scores are scaled by the number of marks, so only the sign is meaningful
                if result > 0:
                    print("Congratulations! you have won the game!")
                elif result == 0:
                    print("The game has ended in a draw.")
                break

            row, column = self.o_ai.decide_turn()
            o_moved = self.try_make_turn(row, column)
            assert o_moved
            result = self.evaluate(self.board)

            if result is not None:
                self.render()
                if result < 0:
                    print("Minimax AI has won the game.")
                elif result == 0:
                    print("The game has ended in a draw.")
                break

        print("Thanks for playing Tic-Tac-Toe. Have a nice day and come back any time!")

    def render(self, mode="Human"):
        """Prints the board to stdout; the ``mode`` argument is not used."""
        shapes = {-1: 'o', 0: ' ', 1: 'x'}
        print(f"{shapes[self.board[0,0]]} | {shapes[self.board[0,1]]} | {shapes[self.board[0,2]]}")
        print('- + - + -')
        print(f"{shapes[self.board[1,0]]} | {shapes[self.board[1,1]]} | {shapes[self.board[1,2]]}")
        print('- + - + -')
        print(f"{shapes[self.board[2,0]]} | {shapes[self.board[2,1]]} | {shapes[self.board[2,2]]}")

## 3.1. Create a benchmark
We first establish a baseline by letting a random agent play against the Minimax AI.

In [5]:
# BENCHMARK: RANDOM AI (x) AGAINST MINIMAX AI (o)

ttt = TicTacToe()

# seat both players on the shared game object
ttt.x_ai = x_ai = RandomAI(ttt)
ttt.o_ai = o_ai = MinimaxAI(ttt)

# tally of finished games keyed by outcome: 1 - x won, -1 - o won, 0 - draw
counters = {-1: 0, 0: 0, 1: 0}
n_games = int(1e4)


def run_trials():
    """
    Plays n_games AI-vs-AI games on ttt and tallies the outcomes in counters.
    The running totals are printed before every 50th game.
    """
    for i in range(n_games):
        if i % 50 == 0:
            print(i, "out of", n_games)
            print(f"stats: out of {i} games, x has won {counters[1]} times, o - {counters[-1]} times, and there were "
                  f"{counters[0]} draws.")

        ttt.reset()
        outcome = ttt.play_ai_game()

        # only the sign of the outcome says who has won
        if outcome > 0:
            winner = 1
        elif outcome < 0:
            winner = -1
        else:
            winner = outcome
        counters[winner] += 1


run_trials()

0 out of 10000
stats: out of 0 games, x has won 0 times, o - 0 times, and there were 0 draws.
50 out of 10000
stats: out of 50 games, x has won 0 times, o - 46 times, and there were 4 draws.
100 out of 10000
stats: out of 100 games, x has won 3 times, o - 86 times, and there were 11 draws.
150 out of 10000
stats: out of 150 games, x has won 6 times, o - 123 times, and there were 21 draws.
200 out of 10000
stats: out of 200 games, x has won 7 times, o - 169 times, and there were 24 draws.
250 out of 10000
stats: out of 250 games, x has won 9 times, o - 209 times, and there were 32 draws.
300 out of 10000
stats: out of 300 games, x has won 12 times, o - 251 times, and there were 37 draws.
350 out of 10000
stats: out of 350 games, x has won 13 times, o - 291 times, and there were 46 draws.
400 out of 10000
stats: out of 400 games, x has won 13 times, o - 332 times, and there were 55 draws.
450 out of 10000
stats: out of 450 games, x has won 17 times, o - 375 times, and there were 58 draws

3950 out of 10000
stats: out of 3950 games, x has won 98 times, o - 3372 times, and there were 480 draws.
4000 out of 10000
stats: out of 4000 games, x has won 101 times, o - 3416 times, and there were 483 draws.
4050 out of 10000
stats: out of 4050 games, x has won 102 times, o - 3456 times, and there were 492 draws.
4100 out of 10000
stats: out of 4100 games, x has won 103 times, o - 3498 times, and there were 499 draws.
4150 out of 10000
stats: out of 4150 games, x has won 105 times, o - 3541 times, and there were 504 draws.
4200 out of 10000
stats: out of 4200 games, x has won 107 times, o - 3584 times, and there were 509 draws.
4250 out of 10000
stats: out of 4250 games, x has won 107 times, o - 3624 times, and there were 519 draws.
4300 out of 10000
stats: out of 4300 games, x has won 108 times, o - 3668 times, and there were 524 draws.
4350 out of 10000
stats: out of 4350 games, x has won 110 times, o - 3711 times, and there were 529 draws.
4400 out of 10000
stats: out of 4400 g

7800 out of 10000
stats: out of 7800 games, x has won 217 times, o - 6601 times, and there were 982 draws.
7850 out of 10000
stats: out of 7850 games, x has won 218 times, o - 6646 times, and there were 986 draws.
7900 out of 10000
stats: out of 7900 games, x has won 219 times, o - 6690 times, and there were 991 draws.
7950 out of 10000
stats: out of 7950 games, x has won 221 times, o - 6731 times, and there were 998 draws.
8000 out of 10000
stats: out of 8000 games, x has won 222 times, o - 6775 times, and there were 1003 draws.
8050 out of 10000
stats: out of 8050 games, x has won 222 times, o - 6818 times, and there were 1010 draws.
8100 out of 10000
stats: out of 8100 games, x has won 224 times, o - 6861 times, and there were 1015 draws.
8150 out of 10000
stats: out of 8150 games, x has won 226 times, o - 6901 times, and there were 1023 draws.
8200 out of 10000
stats: out of 8200 games, x has won 230 times, o - 6944 times, and there were 1026 draws.
8250 out of 10000
stats: out of 

## 3.2. Train the reinforcement learning agent
Next, we train a neural network to play against Minimax AI

In [6]:
# TRAIN DQN FOR TIC-TAC-TOE

from __future__ import division
import argparse

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import FileLogger, ModelIntervalCheckpoint



# Get the environment and extract the number of actions.
from gym.wrappers import TimeLimit
env = TimeLimit(TicTacToe(), max_episode_steps=10)
nb_actions = env.action_space.n

# Next, we build our model: a small fully-connected Q-network. Its input is a stack of the last
# WINDOW_LENGTH observations (3x3 boards), its output is one Q-value per action.
WINDOW_LENGTH = 4
input_shape = (3,3)
input_shape = (WINDOW_LENGTH,) + input_shape

def build_model():
    """
    Builds the Q-network: Flatten -> Dense(32, relu) -> Dense(nb_actions).
    The layer summary is printed as a side effect.
    :return: the (uncompiled) Keras Sequential model.
    """
    model = Sequential()
    model.add(Flatten(input_shape=input_shape))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(nb_actions))
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the output
    model.summary()
    return model


model = build_model()

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 300k steps (nb_steps below).
# This is done so that the agent initially explores the environment (high eps) and then gradually
# sticks to what it knows (low eps). We also set a dedicated eps value that is used during testing.
# Note that we set it to 0.05 so that the agent still performs some random actions. This ensures
# that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                              nb_steps=300000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration (BoltzmannQPolicy would have to be imported from rl.policy first):
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, nb_steps_warmup=50000, gamma=.99,
               target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

print("about to enter the flags branching.")


# Okay, now it's time to learn something! Notice that you can use the built-in Keras callbacks:
# here the weights are checkpointed every 50000 steps.
# NOTE(review): no interrupt handling is done in this cell; keras-rl's fit() is presumed to
# handle an aborted training run itself - confirm.
weights_filename = 'dqn_ttt_weights.h5f'
checkpoint_weights_filename = 'dqn_ttt_weights_{step}.h5f'
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=50000)]
print("about to fit the dqn!")
# dqn.load_weights(weights_filename)
dqn.fit(env, callbacks=callbacks, nb_steps=int(4e5), log_interval=10000, verbose=1)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)

# Finally, evaluate our algorithm for a single episode.
dqn.test(env, nb_episodes=1, visualize=False)


Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 36)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1184      
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 297       
Total params: 1,481
Trainable params: 1,481
Non-trainable params: 0
_________________________________________________________________
None
about to enter the flags branching.
about to fit the dqn!
Training for 1000000 steps ...
Interval 1 (0 steps performed)
3821 episodes - episode_reward: -1.306 [-1.500, 0.930]

Interval 2 (10000 steps performed)
3815 episodes - episode_reward: -1.304 [-1.500, 0.930]

Interval 3 (20000 steps performed)
3797 episodes - episode_reward: -1.315 [-1.500, 0.930]

Interval 4 (30000 steps performed)
3822 e

2978 episodes - episode_reward: -0.840 [-1.500, 0.930] - loss: 0.028 - mean_absolute_error: 0.669 - mean_q: 0.048 - mean_eps: 0.420

Interval 66 (650000 steps performed)
2978 episodes - episode_reward: -0.823 [-1.500, 0.930] - loss: 0.027 - mean_absolute_error: 0.673 - mean_q: 0.030 - mean_eps: 0.411

Interval 67 (660000 steps performed)
2991 episodes - episode_reward: -0.801 [-1.500, 0.930] - loss: 0.030 - mean_absolute_error: 0.677 - mean_q: 0.059 - mean_eps: 0.402

Interval 68 (670000 steps performed)
2937 episodes - episode_reward: -0.777 [-1.500, 0.930] - loss: 0.030 - mean_absolute_error: 0.675 - mean_q: 0.056 - mean_eps: 0.393

Interval 69 (680000 steps performed)
2957 episodes - episode_reward: -0.771 [-1.500, 0.930] - loss: 0.028 - mean_absolute_error: 0.668 - mean_q: 0.086 - mean_eps: 0.384

Interval 70 (690000 steps performed)
2917 episodes - episode_reward: -0.753 [-1.500, 0.930] - loss: 0.030 - mean_absolute_error: 0.670 - mean_q: 0.108 - mean_eps: 0.375

Interval 71 (7000

<keras.callbacks.History at 0x7f03dd77ef98>

## 3.3. Reinforcement Learning agent class to use the same API as other AIs
The NeuralAI class wraps the neural network we have trained behind the same `decide_turn()` API as the other AIs.

In [10]:
# DEFINE AGENT BASED ON THE DQN NET

from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from rl.memory import SequentialMemory
from rl.agents.dqn import DQNAgent


# Dimensions of the Q-network built by NeuralAI.build_model:
# a stack of WINDOW_LENGTH boards (3x3 each) goes in, one value per action comes out.
WINDOW_LENGTH = 4
nb_actions = 9
input_shape = (WINDOW_LENGTH, 3, 3)


class NeuralAI:
    """
    Agent that picks its moves with a DQN whose weights are loaded from a file.
    It exposes the same ``decide_turn()`` API as the other AI classes.
    """

    def __init__(self, game, weights_path=None):
        """
        :param game: object exposing the current position as ``game.board``.
        :param weights_path: file to load the DQN weights from; when omitted (or empty),
            'dqn_ttt_weights.h5f' is used.
        """
        self.game = game
        model = self.build_model()
        dqn = DQNAgent(model=model, nb_actions=nb_actions, gamma=.99,
                       memory=SequentialMemory(limit=10, window_length=WINDOW_LENGTH))

        dqn.compile(Adam(lr=1e-3), metrics=['mae'])
        import sys
        # add the parent directory to the import path only once, so that creating
        # several NeuralAI instances does not keep growing sys.path
        if "../" not in sys.path:
            sys.path.append("../")
        weights_filename = weights_path or 'dqn_ttt_weights.h5f'
        dqn.load_weights(weights_filename)
        self.dqn = dqn

    def decide_turn(self):
        """
        Asks the DQN agent for an action on the current board.

        :return: ``(row, column)`` decoded from the action index (row = action // 3,
            column = action % 3). The cell is not checked for being empty here.
        """
        board = self.game.board
        action = self.dqn.forward(board)
        row = action // 3
        column = action % 3

        return row, column

    @staticmethod
    def build_model():
        """
        Builds the Q-network: Flatten -> Dense(32, relu) -> Dense(nb_actions).
        :return: the (uncompiled) Keras Sequential model.
        """
        model = Sequential()
        model.add(Flatten(input_shape=input_shape))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(nb_actions))
        return model

## 3.4. Benchmark the trained agent

The network is now trained. We benchmark it against the Minimax AI so that it can be compared with the random AI baseline; we also profile the run to see how much computation Minimax and the neural network agent require.

In [11]:
# BENCHMARK: NEURAL AI (x) AGAINST MINIMAX AI (o)

ttt = TicTacToe()

# seat both players on the shared game object
ttt.x_ai = x_ai = NeuralAI(ttt)
ttt.o_ai = o_ai = MinimaxAI(ttt)

# tally of finished games keyed by outcome: 1 - x won, -1 - o won, 0 - draw
counters = {-1: 0, 0: 0, 1: 0}
n_games = int(1e4)


def run_trials():
    """
    Plays n_games AI-vs-AI games on ttt and tallies the outcomes in counters.
    The running totals are printed before every 50th game.
    """
    for i in range(n_games):
        if i % 50 == 0:
            print(i, "out of", n_games)
            print(f"stats: out of {i} games, x has won {counters[1]} times, o - {counters[-1]} times, and there were "
                  f"{counters[0]} draws.")

        ttt.reset()
        outcome = ttt.play_ai_game()

        # only the sign of the outcome says who has won
        if outcome > 0:
            winner = 1
        elif outcome < 0:
            winner = -1
        else:
            winner = outcome
        counters[winner] += 1


# run the benchmark under the profiler and report the calls sorted by cumulative time
from cProfile import Profile
profiler = Profile()
profiler.runcall(run_trials)
profiler.print_stats('cumulative')


0 out of 10000
stats: out of 0 games, x has won 0 times, o - 0 times, and there were 0 draws.
50 out of 10000
stats: out of 50 games, x has won 28 times, o - 7 times, and there were 15 draws.
100 out of 10000
stats: out of 100 games, x has won 61 times, o - 13 times, and there were 26 draws.
150 out of 10000
stats: out of 150 games, x has won 89 times, o - 19 times, and there were 42 draws.
200 out of 10000
stats: out of 200 games, x has won 117 times, o - 26 times, and there were 57 draws.
250 out of 10000
stats: out of 250 games, x has won 150 times, o - 31 times, and there were 69 draws.
300 out of 10000
stats: out of 300 games, x has won 185 times, o - 36 times, and there were 79 draws.
350 out of 10000
stats: out of 350 games, x has won 216 times, o - 40 times, and there were 94 draws.
400 out of 10000
stats: out of 400 games, x has won 242 times, o - 50 times, and there were 108 draws.
450 out of 10000
stats: out of 450 games, x has won 261 times, o - 58 times, and there were 131

3900 out of 10000
stats: out of 3900 games, x has won 2235 times, o - 465 times, and there were 1200 draws.
3950 out of 10000
stats: out of 3950 games, x has won 2257 times, o - 471 times, and there were 1222 draws.
4000 out of 10000
stats: out of 4000 games, x has won 2289 times, o - 479 times, and there were 1232 draws.
4050 out of 10000
stats: out of 4050 games, x has won 2315 times, o - 484 times, and there were 1251 draws.
4100 out of 10000
stats: out of 4100 games, x has won 2344 times, o - 488 times, and there were 1268 draws.
4150 out of 10000
stats: out of 4150 games, x has won 2377 times, o - 492 times, and there were 1281 draws.
4200 out of 10000
stats: out of 4200 games, x has won 2400 times, o - 503 times, and there were 1297 draws.
4250 out of 10000
stats: out of 4250 games, x has won 2435 times, o - 504 times, and there were 1311 draws.
4300 out of 10000
stats: out of 4300 games, x has won 2463 times, o - 511 times, and there were 1326 draws.
4350 out of 10000
stats: out

7700 out of 10000
stats: out of 7700 games, x has won 4454 times, o - 883 times, and there were 2363 draws.
7750 out of 10000
stats: out of 7750 games, x has won 4486 times, o - 888 times, and there were 2376 draws.
7800 out of 10000
stats: out of 7800 games, x has won 4517 times, o - 893 times, and there were 2390 draws.
7850 out of 10000
stats: out of 7850 games, x has won 4547 times, o - 901 times, and there were 2402 draws.
7900 out of 10000
stats: out of 7900 games, x has won 4580 times, o - 905 times, and there were 2415 draws.
7950 out of 10000
stats: out of 7950 games, x has won 4606 times, o - 916 times, and there were 2428 draws.
8000 out of 10000
stats: out of 8000 games, x has won 4632 times, o - 927 times, and there were 2441 draws.
8050 out of 10000
stats: out of 8050 games, x has won 4658 times, o - 932 times, and there were 2460 draws.
8100 out of 10000
stats: out of 8100 games, x has won 4695 times, o - 937 times, and there were 2468 draws.
8150 out of 10000
stats: out