In [1]:
import random
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np

Using TensorFlow backend.
  return f(*args, **kwds)


In [48]:
class Player:
    
    def __init__(self, name, symbol, strategy):
        self._name = name
        self._symbol = symbol
        self._strategy = strategy
    
    def play(self, board):
        '''make next move according to strategy'''
        return self._strategy.next_move(board=board)
    
    @property
    def symbol(self):
        return self._symbol
    
    @property
    def name(self):
        return self._name
    
    def __eq__(self, other):
        return other is not None and self._name == other._name and self._symbol == other._symbol
    
    def __hash__(self):
        return hash((self._name, self._symbol))

In [114]:
class Board:
    
    _EMPTY_SYMBOL = ' '
    _ROWS = 3
    _COLUMNS = _ROWS # we can't play tic tac tow on rectangular boards, can we?
    _SIZE = _ROWS * _COLUMNS
    
    def __init__(self):
        self._board = [self._EMPTY_SYMBOL] * self._SIZE
        self._history = []
    
    def __repr__(self):
        return ''.join([" {} {}".format(self._board[i], self._repr_board(i)) for i in range(self._SIZE)])
    
    def _repr_board(self, i):
        max_val = self._ROWS - 1
        return "|" if i % self._ROWS != max_val else "\n{}\n".format("-" * (self._SIZE + max_val) if i // self._ROWS != max_val else "")
        
    def update(self, row, col, player):
        index = row * self._ROWS + col
        self._board[index] = player.symbol
        self._history.append((index, player.symbol))
    
    def get_value_at_coordinate(self, row, col):
        return self._board[row * self._ROWS + col]
        
    @property
    def free_coordinates(self):
        '''returns list of tuples of coordinates having empty symbols'''
        return [(i // self._COLUMNS, i % self._COLUMNS) for i in range(self._SIZE) if self._board[i] == self._EMPTY_SYMBOL]

    @property
    def size(self):
        return self._SIZE
    
    @property
    def columns(self):
        return self._COLUMNS

    @property
    def rows(self):
        return self._ROWS

    @property
    def is_empty(self):
        return len(self.free_coordinates) == self._SIZE
    
    @property
    def is_full(self):
        return len(self.free_coordinates) == 0
    
    @property
    def history(self):
        return self._history + [(-1, self._EMPTY_SYMBOL) for i in range(len(self._history), self._SIZE)]

    def __eq__(self, other):
        return self._board == other._board
    
    def __hash__(self):
        return hash(''.join(self._board))
    
    def show(self):
        print(self.__repr__())
        
    def clone(self):
        clone = Board()
        clone._board = [self._board[i] for i in range(self._SIZE)]
        clone._history = [(h[0], h[1]) for h in self._history]
        return clone

In [115]:
class ResultChecker:
    
    def __init__(self, player_1, player_2):
        self._player_1 = player_1
        self._player_2 = player_2
    
    def check_winner(self, board):
        rows = {i: [] for i in range(board.rows)}
        cols = {i: [] for i in range(board.columns)}
        diag = []
        rdiag = []
        
        for i in range(board.size):
            row = i // board.columns
            col = i % board.columns
            rows[row].append(board.get_value_at_coordinate(row, col))
            cols[col].append(board.get_value_at_coordinate(row, col))
            if row == col:
                diag.append(board.get_value_at_coordinate(row, col))
            if row + col == board.columns - 1:
                rdiag.append(board.get_value_at_coordinate(row, col))

        for candidate in [diag, rdiag] + [v for v in rows.values()] + [v for v in cols.values()]:
            for player in [self._player_1, self._player_2]:
                if candidate == [player.symbol] * board.columns:
                    return player
                
        return None

In [116]:
class TicTacToeState:
    
    def __init__(self, board, reward):
        self._board = board
        self._reward = reward
        self._previous = []
        self._next = []
    
    @property
    def board(self):
        return self._board

    @property
    def reward(self):
        return self._reward
    
    @property
    def prev_states(self):
        return self._previous

    @property
    def next_states(self):
        return self._next

    def update_reward(self, new_reward):
        self._reward = new_reward
    
    def add_next_state(self, state):
        self._next.append(state)
    
    def add_previous_state(self, state):
        self._previous.append(state)
        
    def clone(self):
        s = TicTacToeState(self._board.clone(), self._reward)
        s._previous = [p for p in self._previous]
        s._next = [n for n in self._next]


In [117]:
class Strategy:
    '''stragegy interface, extend and implement method next_move to have a strategy'''
    def next_move(self, board):
        return None

class RandomStrategy(Strategy):

    def next_move(self, board):
        fc = board.free_coordinates
        return fc[random.randint(0, len(fc) - 1)] if len(fc) > 0 else None

class HumanStrategy(Strategy):

    def next_move(self, board):
        choice_str = raw_input("Your turn (ex: 1,2)>")
        coordinates_str = choice_str.split(",")
        return tuple(map(int, coordinates_str))

In [118]:
class ReinforcementLearningStrategy(Strategy):
    
    def __init__(self, explore=0.15, alpha=0.06):
        self._explore = explore
        self._alpha = alpha
        self._state_map = dict()
        self._choices = []
        self._target_player = None
        
    def new_match(self):
        self._choices = []

    def generate_all_states(self, player_1, player_2, target_player):
        self._target_player = target_player
        result_checker = ResultChecker(player_1, player_2)
        
        initial_state = TicTacToeState(board=Board(), reward=0)
        self._state_map[initial_state.board] = initial_state
        stack = [(initial_state, player_1), (initial_state, player_2)]
        
        while len(stack) > 0:            
            cur_state, cur_player = stack.pop()
            cur_board = cur_state.board
            
            if result_checker.check_winner(cur_board):
                continue
            
            for row, col in cur_board.free_coordinates:
                new_board = cur_board.clone()
                new_board.update(row, col, cur_player)
                
                new_state = None
                
                if new_board in self._state_map:
                    new_state = self._state_map[new_board]
                else:
                    winner = result_checker.check_winner(new_board)
                    
                    if winner == target_player:
                        reward = 1.0
                    elif winner is not None: # Player 2 won
                        reward = 0.0
                    elif len(new_board.free_coordinates) == 0: # tie
                        reward = 0.0
                    else:
                        reward = 0.1
                        
                    new_state = TicTacToeState(board=new_board, reward=reward)
                    self._state_map[new_state.board] = new_state
                    
                new_state.add_previous_state(cur_state)
                cur_state.add_next_state(new_state)                
                stack.append((new_state, player_1 if cur_player != player_1 else player_2))
                
    def next_move(self, board):
        next_state, to_update = self._choose_next_state(board)
        
        if len(self._choices) > 0 and to_update:
            self._update_reward(prev_state=self._choices[-1], next_state=next_state)
            
        self._choices.append(next_state)
        chosen_move = set(board.free_coordinates) - set(next_state.board.free_coordinates)
        return chosen_move.pop() if len(chosen_move) > 0 else None
        
    
    def _choose_next_state(self, board):
        fc = board.free_coordinates
        cur_state = self._state_map[board]
        
        if len(fc) == 0 or len(cur_state.next_states) == 0:
            return cur_state, True
        elif random.random() < self._explore: # Exploring
            choice = fc[random.randint(0, len(fc) - 1)]
            new_board = board.clone()
            new_board.update(choice[0], choice[1], self._target_player)
            return self._state_map[new_board], False
        else: # Exploiting
            state_with_max_reward = max(cur_state.next_states, key=lambda s: s.reward)
            return state_with_max_reward, True
    
    def _update_reward(self, prev_state, next_state):
        new_reward = prev_state.reward + self._alpha * (next_state.reward - prev_state.reward)
        # print("Updating reward from {} to: {}".format(prev_state.reward(), new_reward))
        prev_state.update_reward(new_reward)

In [None]:
class DeepLearningStrategy(Strategy):
    def __init__(self):
        self._model = self.init_nn()
        self._current_player = None

    @staticmethod
    def init_nn():
        board = Board()
        PLAYER_DIM = 2 # current player + target player
        BOARD_DIM = board.size # i.e. 9
        POSS_MOVE_DIM = BOARD_DIM # i.e. the next board, so same as board size
        INPUT_DIM = PLAYER_DIM + BOARD_DIM + POSS_MOVE_DIM # turn, board

        model = Sequential()
        model.add(Dense(2 * INPUT_DIM, input_dim=INPUT_DIM))
        model.add(Dense(INPUT_DIM))
        model.add(Dense(BOARD_DIM))
        model.add(Dense(1)) # predicting if the move was a 'winning' move or not
        model.add(Activation('softmax'))
        model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def generate_all_states(self, player_1, player_2, target_player):
        # storing current player, useful?
        self._current_player = target_player
        result_checker = ResultChecker(player_1, player_2)
        initial_state = TicTacToeState(board=Board(), reward=0)
        # used to check if a state has been evaluated (maybe?)
        state_map = {
            initial_state.board: initial_state
        }
        stack = [(initial_state, player_1), (initial_state, player_2)]
        
        data = []
        labels = []
        while len(stack) > 0:            
            cur_state, cur_player = stack.pop()
            cur_board = cur_state.board
            
            if result_checker.check_winner(cur_board):
                continue
            
            for row, col in cur_board.free_coordinates:
                new_board = cur_board.clone()
                new_board.update(row, col, cur_player)
                
                new_state = None
                
                if new_board in state_map:
                    new_state = state_map[new_board]
                else:
                    winner = result_checker.check_winner(new_board)
                    
                    if winner == target_player:
                        # we won, yay!
                        reward = 1.0
                    elif winner is not None: 
                        # they won, booh!
                        reward = 0.0
                    elif len(new_board.free_coordinates) == 0: 
                        # it's a tie
                        reward = 0.0
                    else:
                        # still playing
                        reward = 0.1
                    
                    new_state = TicTacToeState(board=new_board, reward=reward)
                    
                stack.append((new_state, player_1 if cur_player != player_1 else player_2))
                data.append(np.asarray(
                    [hash(cur_player), hash(target_player)] + 
                    [hash(i) for i in cur_board.history] + 
                    [hash(i) for i in new_board.history]))
                labels.append(np.asarray([reward]))
                state_map[new_state.board] = new_state

        self._model.fit(data, labels, nb_epoch=3, validation_split=0.05)

    def next_move(self, board):
        for row, col in board.free_coordinates:
            pass
        return None

In [120]:
class Match:
    
    def __init__(self, board, player_1, player_2):
        self._board = board
        self._player_1 = player_1
        self._player_2 = player_2
        self._result_checker = ResultChecker(player_1, player_2)
    
    def play(self, show=True):
        
        if not self._board.is_empty:
            raise Exception("Board is not empty!\n{}".format(self._board))
        
        cur_player = self._select_first_player(show)
        while not self._board.is_full:
            choice = cur_player.play(board=board)
            if show:
                print("Player {} ({}) has chosen {}".format(cur_player.name, cur_player.symbol, choice))
            self._board.update(choice[0], choice[1], cur_player)
            if show:
                self._board.show()
            winner = self._result_checker.check_winner(self._board)
            if winner:
                player_1.play(board=board)
                player_2.play(board=board)
                break
                
            cur_player = self._player_1 if cur_player == self._player_2 else self._player_2
        
        winner = self._result_checker.check_winner(self._board)
        if winner:
            if show:
                print("Player {} ({}) won the game! :) ".format(winner.name, winner.symbol))
        else:
            player_1.play(board=board)
            player_2.play(board=board)
            if show:
                print("It's a tie :D")
        return winner
    
    def _select_first_player(self, show):
        if show:
            print("Flipping a coin to decide which player will start the match...")
        first_player = self._player_1 if random.randint(0, 1) == 0 else self._player_2
        if show:
            print("{} will start the game".format(first_player.name))
        return first_player
    

In [122]:
rls_1 = ReinforcementLearningStrategy()
rls_2 = ReinforcementLearningStrategy()
dls_1 = DeepLearningStrategy()

In [123]:
time dls_1.generate_all_states(player_1=player_1, player_2=player_2, target_player=player_2)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 1099890 arrays: [array([ 7993904492217295360,  7993904492217295360, -6635925921834205456,
       -6635925921834205456, -6635925921834205456, -6635925921834205456,
       -6635925921834205456, -6635925921834205456, -6...

In [57]:
player_1 = Player(name='first', symbol='X', strategy=rls_1)
player_2 = Player(name='second', symbol='O', strategy=rls_2)

In [58]:
time rls_1.generate_all_states(player_1=player_1, player_2=player_2, target_player=player_1)

CPU times: user 39.4 s, sys: 106 ms, total: 39.5 s
Wall time: 39.8 s


In [59]:
time rls_2.generate_all_states(player_1=player_1, player_2=player_2, target_player=player_2)

CPU times: user 39 s, sys: 81.6 ms, total: 39.1 s
Wall time: 39.3 s


In [60]:
matches = 10000
for i in range(matches):
    if i % 1000 == 0:
        print("MATCH: {}".format(i))
    board = Board()
    rls_1.new_match()
    rls_2.new_match()
    match = Match(board=board, player_1=player_1, player_2=player_2)
    match.play(show=False)

MATCH: 0
MATCH: 1000
MATCH: 2000
MATCH: 3000
MATCH: 4000
MATCH: 5000
MATCH: 6000
MATCH: 7000
MATCH: 8000
MATCH: 9000


human_player = Player(name='Human', symbol='O', strategy=HumanStrategy())

In [18]:
board = Board()
match = Match(board=board, player_1=player_1, player_2=human_player)
match.play()

Flipping a coin to decide which player will start the match...
first will start the game
Player first (X) has chosen (1, 1)
   |   |   
-----------
   | X |   
-----------
   |   |   


Your turn (ex: 1,2)>2,1
Player Human (O) has chosen (2, 1)
   |   |   
-----------
   | X |   
-----------
   | O |   


Player first (X) has chosen (0, 0)
 X |   |   
-----------
   | X |   
-----------
   | O |   


Your turn (ex: 1,2)>2,2
Player Human (O) has chosen (2, 2)
 X |   |   
-----------
   | X |   
-----------
   | O | O 


Player first (X) has chosen (2, 0)
 X |   |   
-----------
   | X |   
-----------
 X | O | O 


Your turn (ex: 1,2)>0, 2
Player Human (O) has chosen (0, 2)
 X |   | O 
-----------
   | X |   
-----------
 X | O | O 


Player first (X) has chosen (1, 0)
 X |   | O 
-----------
 X | X |   
-----------
 X | O | O 


Player first (X) won the game! :) 


<__main__.Player instance at 0x1096ab320>