In [1]:
import random
from collections import defaultdict, deque
from itertools import product, chain
import copy
import time
import pdb

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.optimizers import Adam

from IPython.display import clear_output, display, HTML


In [2]:

def print_board(board):
    """Render a board as plain text.

    Every row becomes one line of the form ' a | b | c', consecutive rows are
    separated by a line of '=' characters, and the string ends with an
    extra newline.
    """
    divider = "===========\n"
    rendered_rows = [" " + " | ".join(cells) + "\n" for cells in board]
    return divider.join(rendered_rows) + "\n"

def print_board_html(board):
    """Render a board as an HTML string: a <style> rule followed by a <table>.

    Each board row becomes one <tr>; an empty square (' ') is padded with
    non-breaking spaces so that the cell keeps its width.
    """
    blank_cell = " &nbsp; &nbsp; "
    pieces = ['<style>table, th, td {border: 1px solid black;}</style>',
              '<table>']
    for row in board:
        cells = [blank_cell if square == ' ' else square for square in row]
        pieces.append('<tr><td> ' + "</td><td>".join(cells) + ' </td></tr>')
    pieces.append("</table>")
    return ''.join(pieces)

# Set to True once the pop-up window has been opened by print_board_win.
win_initialized = False
def print_board_win(g, outstr=""):
    """Build an HTML/Javascript snippet that shows the board in a pop-up window.

    The first call also emits the `window.open` statement; later calls only
    replace the body of the window stored in the Javascript variable `win`.

    g: object whose `board` attribute is rendered with print_board_html
    outstr: extra HTML appended below the board
    Returns an IPython HTML object wrapping the <script> element.
    """
    global win_initialized
    script_parts = ['<script type="text/Javascript">']
    if not win_initialized:
        script_parts.append('var win = window.open("", "Title", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=200, height=200, top="+(screen.height-400)+", left="+(screen.width-840));')
        win_initialized = True
    body_html = print_board_html(g.board) + '<br/>' + outstr
    script_parts.append('win.document.body.innerHTML = \'' + body_html + '\';')
    script_parts.append('</script>')
    return HTML(''.join(script_parts))

In [3]:
BOARD_SIZE = 3
MAX_MOVES = BOARD_SIZE * BOARD_SIZE
# Empty board, derived from BOARD_SIZE so the two can never disagree.
INIT_BOARD = tuple(tuple(' ' for _ in range(BOARD_SIZE)) for _ in range(BOARD_SIZE))

class Game:
    """Maintain game state.

    board:   tuple of row tuples holding 'X', 'O' or ' ' (immutable, so a board
             can be used as a dictionary key)
    player:  the side to move next, 'X' or 'O'
    history: every board produced by a recorded move, oldest first
    """

    def __init__(self, startplayer='X'):
        self.board = INIT_BOARD
        self.player = startplayer
        self.history = []

    def __repr__(self):
        """string representation of board"""
        return print_board(self.board)

    def play(self, move, record=True):
        """Apply a move given as a (row, col, player) tuple.

        record=True: update game state (board, history, side to move) and
                     return the new board
        record=False: just return the resulting board, i.e. for evaluation

        Raises ValueError (an Exception subclass, so existing
        `except Exception` handlers still catch it) when it is not `player`'s
        turn, the coordinates are off the board, or the square is occupied.
        """
        i, j, player = move
        size = len(self.board)
        if player != self.player:
            raise ValueError("play: wrong player %s" % (player))
        # Negative indices must be rejected explicitly: Python would accept them
        # for the occupancy test below, but no row index ever equals a negative
        # i, so the move would silently leave the board unchanged and still
        # hand the turn to the other side.
        if not (0 <= i < size and 0 <= j < size):
            raise ValueError("play: bad square coords %d, %d" % (i, j))
        if self.board[i][j] != ' ':
            raise ValueError("play: move to non-empty square")

        # new tuple, same except set square = player
        new_board = tuple(
            row if r != i
            else tuple(player if c == j else square for c, square in enumerate(row))
            for r, row in enumerate(self.board))
        if record:
            self.board = new_board
            self.history.append(self.board)
            self.player = 'O' if self.player == 'X' else 'X'

        return new_board

    def is_winner(self, player='X'):
        """Return True when `player` fills a whole row, column or diagonal."""
        owned = np.array(self.board) == player
        return bool(owned.all(axis=1).any()                 # any full row
                    or owned.all(axis=0).any()              # any full column
                    or np.diagonal(owned).all()             # main diagonal
                    or np.diagonal(np.fliplr(owned)).all()) # anti-diagonal
    
# Start a fresh game; the bare `g` on the last line displays it via Game.__repr__.
g = Game()
g

   |   |  
   |   |  
   |   |  


In [4]:
# Replay a scripted game in which X completes the anti-diagonal on its fourth move.
scripted_moves = [(1,1,'X'), (0,1,'O'), (0,0,'X'), (2,2,'O'),
                  (2,0,'X'), (1,0,'O'), (0,2,'X')]
for scripted_move in scripted_moves:
    g.play(scripted_move)
# some bad moves (each would raise if tried right after the first move above)
#g.play((2,2,'X')) # wrong player
#g.play((1,1,'O')) # occupied square
#g.play((3,1,'O')) # off board
print(g.is_winner('O'))
print(g.is_winner('X'))
g

False
True


 X | O | X
 O | X |  
 X |   | O


In [5]:
# check winning boards: the empty board first (expected False), then every
# full row, column and diagonal of X (expected True).
# Each layout lists the nine squares in row-major order.
winning_layouts = ["         ",
                   "XXX      ",
                   "   XXX   ",
                   "      XXX",
                   "X  X  X  ",
                   " X  X  X ",
                   "  X  X  X",
                   "X   X   X",
                   "  X X X  "]
for layout in winning_layouts:
    bx = tuple(tuple(layout[k:k + 3]) for k in range(0, 9, 3))
    g.board = bx
    print(g.is_winner('X'))
    print(g)


False
   |   |  
   |   |  
   |   |  


True
 X | X | X
   |   |  
   |   |  


True
   |   |  
 X | X | X
   |   |  


True
   |   |  
   |   |  
 X | X | X


True
 X |   |  
 X |   |  
 X |   |  


True
   | X |  
   | X |  
   | X |  


True
   |   | X
   |   | X
   |   | X


True
 X |   |  
   | X |  
   |   | X


True
   |   | X
   | X |  
 X |   |  




In [6]:
LEARNING_RATE = 0.25
DISCOUNT_RATE = 0.05
EXPLORATION_RATE = 0.025

class RLagent:
    """Simple reinforcement learning agent
    initialize with an array (defaultdict) with value for each board
    select_move: given a board, determine valid moves, play move with best value
    (or explore random move with probability EXPLORATION_RATE)
    train: traverse all game boards in game state history,
    update value array based on winner, discount rate, learning rate

    Values are expressed from X's point of view: +1 is an X win, -1 an O win,
    0 a draw.  The X agent therefore maximises and the O agent minimises.
    """
    def __init__(self,
                 game,
                 V_dict,
                 player='O',
                 learning_rate=LEARNING_RATE,
                 discount_rate=DISCOUNT_RATE,
                 exploration_rate=EXPLORATION_RATE
                ):
        self.game = game
        self.V = V_dict
        self.player = player
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.exploration_rate = exploration_rate

    def valid_moves(self):
        """Return a (row, col, player) move for every empty square."""
        return [(i, j, self.player)
                for i, row in enumerate(self.game.board)
                for j, colval in enumerate(row)
                if colval == ' ']

    def select_move(self, verbose=False, exploration_rate=None):
        """select best scoring action,
        if more than one have best score pick random from best

        exploration_rate: probability of ignoring the value table and picking
        uniformly at random; None means "use the agent's own rate".  An
        explicit 0 disables exploration.
        """
        moves = self.valid_moves()

        # Compare against None, not truthiness: `not 0` is True, which used to
        # turn a requested rate of 0 back into the default rate.
        if exploration_rate is None:
            exploration_rate = self.exploration_rate

        # boards resulting from each candidate move, without recording them
        boards = [self.game.play(move, record=False) for move in moves]

        # choose a random move some % of time specified by exploration rate
        if random.uniform(0, 1) < exploration_rate:
            # identical scores make every move a "best" move below
            scores = [0.5 for _ in moves]
            if verbose:
                print("Random exploration")
        else:
            # look up scores
            scores = [self.V[board] for board in boards]

        if verbose:
            for i, s in enumerate(scores):
                print("%d.  %.04f\n%s" % (i, s, print_board(boards[i])))

        # if player is X, choose highest value for X else lowest value for X
        best_score = max(scores) if self.player == 'X' else min(scores)
        # get all scores matching best
        best_moves = [moves[i] for i, score in enumerate(scores) if score == best_score]
        # pick one
        return random.choice(best_moves)

    def train(self, verbose=None):
        """Update the value table from the finished game's history.

        verbose: print each update; None (default) falls back to a module-level
        `verbose` flag when one is defined and to False otherwise, so calling
        train() no longer fails when that global has not been created yet.
        """
        if verbose is None:
            verbose = bool(globals().get('verbose', False))

        # last board gets value of 1 if X wins, -1 if O wins, 0 if draw
        reward = 1  if self.game.is_winner('X') \
            else -1 if self.game.is_winner('O') \
            else 0

        for b in reversed(self.game.history):
            # update value of each board you see, by (learning rate %) of the way to current reward
            old = self.V[b]
            self.V[b] = old + (reward - old) * self.learning_rate
            # discount reward as boards get older
            reward = reward * (1-self.discount_rate)

            if verbose:
                print("old %.04f new %.04f\n%s"% (old, self.V[b], print_board(b)))


In [7]:
class HumanAgent:
    """human player, get moves from input"""

    def __init__(self, game, player):
        self.game = game
        self.player = player

    def get_dim(self, prompt):
        """Prompt until a row or column number in 1..BOARD_SIZE is entered.

        Blank, non-numeric and out-of-range answers simply repeat the prompt
        (non-numeric text used to raise ValueError out of int()).
        Returns the zero-based index, i.e. the entered number minus one.
        """
        allowed = range(1, BOARD_SIZE + 1)
        while True:
            print(prompt)
            inputstr = input().strip()
            try:
                dim = int(inputstr)
            except ValueError:
                # not a whole number: ask again
                continue
            if dim in allowed:
                return dim - 1

    def get_move(self):
        """Ask for a row and a column until the game accepts the move.

        The move is played on self.game here; the caller must not play it
        again.  Returns the zero-based (row, col) that was played.
        """
        while True:
            row = self.get_dim("Enter row: ")
            col = self.get_dim("Enter column: ")
            try:
                self.game.play((row, col, self.player))
            except Exception as e:
                # illegal move (wrong player, occupied square, ...): report and retry
                print(str(e))
            else:
                return row, col

def play_again():
    """Ask whether to play another game.

    Returns True when the typed answer starts with 'y' (in either case).
    """
    print('Play again? (y or n)')
    answer = input()
    normalized = answer.lower()
    return normalized.startswith('y')
    


In [8]:
# play human v. human: both sides are typed in at the prompt

keep_playing = True
while keep_playing:
    g = Game()
    playerX = HumanAgent(g, 'X')
    playerO = HumanAgent(g, 'O')

    max_moves = BOARD_SIZE * BOARD_SIZE
    winner = None

    for _ in range(max_moves):
        clear_output()
        print(g)

        # get_move() keeps prompting until the move is legal and plays it on g
        player = g.player
        mover = playerX if player == 'X' else playerO
        row, col = mover.get_move()

        if g.is_winner(player):
            winner = player
            break

    # show the final position and the result
    clear_output()
    print(g)
    print("Draw" if winner is None else "%s wins!" % winner)

    keep_playing = play_again()
    if not keep_playing:
        print("Bye!")


 X |   | O
 X | O |  
 X |   |  


X wins!
Play again? (y or n)
n
Bye!


In [13]:
#%%time
# play a bunch of games computer v. computer and update V table

# exploration rate used for the first game; the training loop below decays it
# linearly towards 0 over NUM_GAMES
START_EXPLORATION_RATE = 0.25
NUM_GAMES = 99999
# value table keyed by board tuple; boards never seen before read as 0
V = defaultdict(lambda: 0)
# module-level flag: it is the default for play_game's `verbose` argument and
# is also read directly by the agents' train() methods
verbose = False

def play_game(V,
              board_size=BOARD_SIZE,
              exploration_rate=START_EXPLORATION_RATE,
              outstr="",
              show_display=True,
              verbose=verbose):
    """Play one computer-v-computer game and train the shared value table.

    V: value table handed to both agents
    board_size: accepted but not used; the move limit comes from BOARD_SIZE
    exploration_rate: passed to every select_move call
    outstr: HTML status line shown above the board when show_display is set
    show_display: clear the cell output and show the final board
    verbose: passed to select_move; also prints the value of every recorded board
    Returns 'X', 'O', or None for a draw.
    """
    g = Game()
    agents = {'X': RLagent(g, V, 'X'),
              'O': RLagent(g, V, 'O')}
    winner = None

    for move_counter in range(BOARD_SIZE * BOARD_SIZE):
        player = g.player
        chosen = agents[player].select_move(verbose, exploration_rate=exploration_rate)
        g.play(chosen)

        if g.is_winner(player):
            winner = player
            break

    if show_display:
        clear_output()
        display(HTML(outstr))
        display(HTML(print_board_html(g.board)))

    if verbose:
        for i, b in enumerate(g.history):
            print("Move %d" % i)
            print(V[b])
            print(print_board(b))

    # update V
    # both agents share the V table and the game history, so training through
    # one of them is enough; train only once
    agents['O'].train()

    return winner

# running tally of results over the whole training run
winx, wino, draws = 0,0,0
for game_counter in range(NUM_GAMES):
    # linear epsilon decay
    exploration_rate = (1 - game_counter/NUM_GAMES) * START_EXPLORATION_RATE

    outstr = "%s: Game %6d X wins: %d O wins %d Draws: %d" % (
        time.strftime("%H:%M:%S"), game_counter, winx, wino, draws)

    # refresh the on-screen board only every 1000th game
    show_display = (game_counter % 1000 == 0)
    winner = play_game(V, exploration_rate=exploration_rate, outstr=outstr, show_display=show_display)
    if winner is None:
        draws += 1
    elif winner == 'X':
        winx += 1
    else:
        wino += 1
    
    


0,1,2
X,O,X
X,O,X
O,X,O


In [14]:
# check out a few V values: one line of play, from the empty board to an X win

sample_boards = [
    ((' ', ' ', ' '),(' ', ' ', ' '),(' ', ' ', ' ')),
    ((' ', ' ', ' '),(' ', ' ', ' '),('X', ' ', ' ')),
    ((' ', 'O', ' '),(' ', ' ', ' '),('X', ' ', ' ')),
    (('X', 'O', ' '),(' ', ' ', ' '),('X', ' ', ' ')),
    (('X', 'O', ' '),('O', ' ', ' '),('X', ' ', ' ')),
    (('X', 'O', ' '),('O', ' ', ' '),('X', ' ', 'X')),
    (('X', 'O', ' '),('O', 'O', ' '),('X', ' ', 'X')),
    (('X', 'O', ' '),('O', 'O', ' '),('X', 'X', 'X')),
]
for b in sample_boards:
    print("%f\n%s"% (V[b], print_board(b)))



0.000000
   |   |  
   |   |  
   |   |  


-0.216749
   |   |  
   |   |  
 X |   |  


0.725068
   | O |  
   |   |  
 X |   |  


0.820386
 X | O |  
   |   |  
 X |   |  


0.856862
 X | O |  
 O |   |  
 X |   |  


0.502721
 X | O |  
 O |   |  
 X |   | X


0.514714
 X | O |  
 O | O |  
 X |   | X


0.999821
 X | O |  
 O | O |  
 X | X | X




In [None]:
# play human vs. computer and learn interactively:
# the human is X, the table-driven agent is O and is trained after every game

keep_playing = True
while keep_playing:
    g = Game()
    playerX = HumanAgent(g, 'X')
    playerO = RLagent(g, V, 'O')

    max_moves = BOARD_SIZE * BOARD_SIZE
    winner = None

    for _ in range(max_moves):
        clear_output()
        print(g)

        player = g.player
        if player != 'X':
            # computer's turn: the agent only picks the move, so play it here
            move = playerO.select_move(verbose=False, exploration_rate=0)
            g.play(move)
        else:
            # get_move() already plays the move on g
            row, col = playerX.get_move()

        if g.is_winner(player):
            winner = player
            break

    clear_output()
    print(g)
    print("Draw" if winner is None else "%s wins!" % winner)

    # fold the finished game into the value table
    playerO.train()

    keep_playing = play_again()
    if not keep_playing:
        print("Bye!")


In [None]:
### export csv

def v_to_dataframe(V, n_squares=9):
    """Flatten a value table into a DataFrame with one row per board.

    V: mapping from board (tuple of row tuples) to value
    n_squares: number of squares on a board (9 for the 3x3 game)

    Returns a frame whose columns 0..n_squares-1 hold the squares in
    row-major order and whose 'val' column holds the value.  The columns are
    created even for an empty table, so the exported CSV always has the
    layout the loader expects.
    """
    # one record per board: the flattened squares followed by the value
    records = [tuple(chain.from_iterable(board)) + (value,)
               for board, value in V.items()]
    return pd.DataFrame(records, columns=list(range(n_squares)) + ['val'])

# flatten the value table, write it to V.csv in the working directory
# (the index is written as an extra first column) and preview the first 30 rows
Vdf = v_to_dataframe(V)
Vdf.to_csv('V.csv')
Vdf.head(30)


In [None]:
def V_from_csv(filename, as_dict=False, default=0):
    """Load a value table from a CSV file with columns '0'..'8' and 'val'.

    filename: path of the CSV file; any other columns (such as the saved
              index) are dropped
    as_dict:  False (default) returns the DataFrame restricted to the ten
              columns above, which is what this function has always returned.
              True returns a defaultdict keyed by 3x3 board tuples; earlier
              versions built that dict and then discarded it.
    default:  value the returned dict reports for boards that are not in the
              file (only used with as_dict=True)
    """
    wanted = [str(k) for k in range(9)] + ['val']
    Vdf = pd.read_csv(filename)
    Vdf = Vdf[wanted]
    if not as_dict:
        return Vdf

    side = 3
    table = defaultdict(lambda: default)
    # plain tuples per row avoid positional Series lookups (deprecated in pandas)
    for record in Vdf.itertuples(index=False, name=None):
        squares, value = record[:-1], record[-1]
        board = tuple(tuple(squares[k:k + side]) for k in range(0, side * side, side))
        table[board] = value
    return table

# read the exported table back from V.csv and preview the first rows
Vdf = V_from_csv('V.csv')
Vdf.head()

# pickle would have been easier

In [None]:
# agent that uses neural network instead of lookup table/linear model

LEARNING_RATE = 0.4
DISCOUNT_RATE = 0.05
EXPLORATION_RATE = 0.1
QUEUE_LEN = 1000
INPUT_DIM=9

class DeepRLagent:
    """Instead of updating a V dict in training, add experienced reward values to dataframe
    and train neural net to predict boards based on the experienced values

    Values are from X's point of view (+1 X win, -1 O win, 0 draw), so the X
    agent maximises the predicted value and the O agent minimises it.
    """

    # number of most recent observations used for each fit
    TRAIN_WINDOW = 10000

    def __init__(self,
                 game,
                 V_model,
                 V_hist,
                 player='O',
                 discount_rate=DISCOUNT_RATE,
                 exploration_rate=EXPLORATION_RATE
                ):
        self.game = game
        self.player = player
        self.discount_rate = discount_rate
        self.exploration_rate = exploration_rate
        self.V_hist = V_hist
        self.V_model = V_model
        self.best_metric = None
        self.best_model = None

    def valid_moves(self):
        """Return a (row, col, player) move for every empty square."""
        return [(i, j, self.player)
                for i, row in enumerate(self.game.board)
                for j, colval in enumerate(row)
                if colval == ' ']

    def select_move(self, verbose=False, exploration_rate=None):
        """select best scoring action,
        if more than one have best score pick random from best

        exploration_rate: probability of picking uniformly at random instead
        of asking the model; None means "use the agent's own rate" and an
        explicit 0 disables exploration.
        Raises ValueError when the board has no empty square.
        """
        moves = self.valid_moves()
        if not moves:
            raise ValueError("select_move: no valid moves")

        # `is None`, not truthiness: a requested rate of 0 must stay 0
        if exploration_rate is None:
            exploration_rate = self.exploration_rate

        # boards resulting from each candidate move, without recording them
        boards = [self.game.play(move, record=False) for move in moves]

        # choose a random move some % of time specified by exploration rate
        if random.uniform(0, 1) < exploration_rate:
            # identical scores make every move a "best" move below
            scores = [0.5 for _ in moves]
            if verbose:
                print("Random exploration")
        else:
            # score all candidate boards with a single predict call and turn
            # the (n, 1) result into plain floats
            features = np.array([self.flatten(b) for b in boards], dtype=float)
            predictions = self.V_model.predict(features, verbose=0)
            scores = np.asarray(predictions, dtype=float).reshape(-1).tolist()

        if verbose:
            for i, s in enumerate(scores):
                print("%d.  %.04f\n%s" % (i, s, print_board(boards[i])))

        # if player is X, choose highest value for X else lowest value for X
        best_score = max(scores) if self.player == 'X' else min(scores)
        # get all scores matching best
        best_moves = [moves[i] for i, score in enumerate(scores) if score == best_score]
        # pick one
        return random.choice(best_moves)

    def flatten(self, b):
        """convert board to a flat list of ints: 1 for 'X', -1 for 'O', 0 otherwise"""
        return [1 if square == 'X' else -1 if square == 'O' else 0
                for square in chain.from_iterable(b)]

    def train(self, initial_epoch=0, evaluate=False, verbose=None):
        """Append the finished game to V_hist and fit the model for one epoch.

        initial_epoch: accepted for compatibility but not used; every call
                       runs exactly one epoch starting from epoch 0
        evaluate: also print MSE/RMSE/MAE on the training window and save the
                  model (through the module-level save_model) when the MAE
                  improves on the module-level best_metric
        verbose: print each stored observation; None falls back to a
                 module-level `verbose` flag when defined, else False
        """
        # best_* live at module level because new agents are created for
        # every game and the best score has to survive across games
        global best_metric, best_model
        if verbose is None:
            verbose = bool(globals().get('verbose', False))

        # last board gets value of 1 if X wins, -1 if O wins, 0 if draw
        reward = 1  if self.game.is_winner('X') \
            else -1 if self.game.is_winner('O') \
            else 0

        for b in reversed(self.game.history):
            # append board, reward to the shared pandas dataframe, in place
            observation = self.flatten(b)
            observation.append(reward)
            self.V_hist.loc[self.V_hist.shape[0]] = observation
            # discount value as boards get older
            reward = reward * (1-self.discount_rate)

            if verbose:
                print("new reward %.04f\n%s"% (reward, print_board(b)))

        # explicit float arrays: the history frame may hold object-dtype columns
        window = self.V_hist.iloc[-self.TRAIN_WINDOW:]
        train_X = window.iloc[:, :9].to_numpy(dtype=float)
        train_y = window.iloc[:, -1].to_numpy(dtype=float)
        self.V_model.fit(train_X,
                         train_y,
                         batch_size=max(1, len(train_X)),  # whole window in one batch
                         initial_epoch=0,
                         epochs=1,
                         verbose=0
                        )
        if evaluate:
            train_y_predict = np.asarray(self.V_model.predict(train_X, verbose=0)).reshape(-1)
            mse = mean_squared_error(train_y, train_y_predict)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(train_y, train_y_predict)
            print("MSE: %.4f RMSE %.4f MAE %.4f" % (mse, rmse, mae))
            if globals().get('best_model') is None or mae < best_metric:
                # NOTE(review): copy.copy is a shallow copy, so this object
                # presumably keeps sharing weights with the live model; the
                # files written by save_model are the actual snapshot.
                best_model = copy.copy(self.V_model)
                best_metric = mae
                self.best_model, self.best_metric = best_model, best_metric
                save_model('model')
                    

In [None]:
def build_ols_model(input_size = INPUT_DIM,
                    n_hidden_layers=1,
                    largest_layer_size=32,
                    activation='relu',
                    reg_penalty=0.0,
                    dropout=False,
                    verbose=True
                   ):
    """Build and compile a feed-forward regression network.

    input_size: number of input features
    n_hidden_layers: number of hidden Dense layers; each one is half as wide
                     as the previous one (never narrower than 1 unit)
    largest_layer_size: width of the first hidden layer
    activation: activation of the hidden layers
    reg_penalty: L2 penalty applied to every hidden layer's kernel
    dropout: dropout rate inserted before every hidden layer except the first;
             a falsy value disables dropout
    verbose: print one line per layer and the model summary

    Returns a Sequential model with a single linear output unit, compiled
    with MSE loss, MAE metric and Adam(learning_rate=0.0001).
    """
    model = Sequential()
    hidden_layer_size = largest_layer_size

    for i in range(n_hidden_layers):
        if verbose:
            print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1,
                                                                            hidden_layer_size,
                                                                            activation,
                                                                            reg_penalty,
                                                                            dropout,
                                                                           ))
        if i and dropout:
            model.add(Dropout(dropout))

        layer_kwargs = dict(units=hidden_layer_size,
                            activation=activation,
                            kernel_initializer=keras.initializers.glorot_uniform(),
                            kernel_regularizer=keras.regularizers.l2(reg_penalty),
                            name="Dense%02d" % i)
        if i == 0:
            # only the first layer declares the input shape; later layers infer it
            layer_kwargs['input_shape'] = (input_size,)
        model.add(Dense(**layer_kwargs))

        # keep at least one unit so deep stacks cannot reach a zero-width layer
        hidden_layer_size = max(1, hidden_layer_size // 2)

    model.add(Dense(1, activation='linear'))

    if verbose:
        # summary() prints by itself and returns None, so it is not wrapped in print()
        model.summary()

    # reduce learning rate by 1/10 vs. default, we are calling repeatedly for 1 iteration
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.0001), metrics=['mae'])

    return model

def load_model(filename, verbose=True):
    """Rebuild a model from '<filename>.json' plus '<filename>.h5' and compile it.

    NOTE: this definition replaces the `load_model` imported from
    tensorflow.keras.models at the top of the notebook.

    filename: path without extension
    verbose: also print the model summary
    Returns the model compiled with MSE loss, MAE metric and
    Adam(learning_rate=0.00001).
    """
    # context manager closes the file even when reading fails
    with open('%s.json' % filename, 'r') as json_file:
        loaded_model_json = json_file.read()
    mymodel = model_from_json(loaded_model_json)
    # load weights into new model
    mymodel.load_weights("%s.h5" % filename)
    print("Loaded saved V_model")
    mymodel.compile(loss='mse', optimizer=Adam(learning_rate=0.00001), metrics=['mae'])
    if verbose:
        # summary() prints by itself and returns None
        mymodel.summary()
    return mymodel

def save_model(filename, verbose=True, model=None):
    """Write a model's architecture to '<filename>.json' and weights to '<filename>.h5'.

    filename: path without extension
    verbose: print a confirmation once both files are written
    model: model to save; None (default) saves the module-level V_model, as
           this function always did
    """
    if model is None:
        model = V_model
    # serialize model to JSON
    with open("%s.json" % filename, "w") as json_file:
        json_file.write(model.to_json())
    # serialize weights to HDF5
    model.save_weights("%s.h5" % filename)
    if verbose:
        print("Saved '%s' to disk" % filename)


In [None]:
#%%time
# use DeepRLAgent
# play a bunch of games computer v. computer and update V function approximator

START_EXPLORATION_RATE = 0.04
NUM_GAMES = 9999
# module-level flag: default for play_game's `verbose` argument and read by
# the agents' train() methods
verbose = False

# shared experience log: nine board squares plus the discounted reward
V_hist_columns=['0','1','2','3','4','5','6','7','8','val']
V_hist = pd.DataFrame(columns=V_hist_columns)

# best training MAE seen so far and the corresponding model (updated by train)
best_metric=None
best_model=None

# Continue from the best previously saved model when there is one; on a fresh
# checkout (no model.json / model.h5 yet) start from a newly built network so
# the notebook still runs from top to bottom.
try:
    V_model = load_model('model')
except OSError:
    V_model = build_ols_model(input_size = INPUT_DIM,
                              n_hidden_layers=3,
                              largest_layer_size=128,
                              activation='tanh',
                              reg_penalty=0.0,
                              dropout=0.0,
                              verbose=True)
    
def play_game(V,
              board_size=BOARD_SIZE,
              exploration_rate=START_EXPLORATION_RATE,
              train=True,
              evaluate=False,
              game_counter=0,
              verbose=verbose):
    """Play one computer-v-computer game with the neural-network agents.

    This definition replaces the table-based play_game from the earlier cell.

    V: value model shared by both agents (must provide predict and fit);
       earlier versions ignored this argument and used the global V_model
    board_size: accepted but not used; the move limit comes from BOARD_SIZE
    exploration_rate: passed to every select_move call
    train: after the game, append it to the module-level V_hist and fit the model
    evaluate: passed to train()
    game_counter: passed to train() as initial_epoch
    verbose: passed to select_move; also prints the predicted value of every
             recorded board
    Returns 'X', 'O', or None for a draw.
    """
    g = Game()

    playerX = DeepRLagent(g, V, V_hist, 'X')
    playerO = DeepRLagent(g, V, V_hist, 'O')

    max_moves = BOARD_SIZE * BOARD_SIZE
    winner = None
    for move_counter in range(max_moves):
        player = g.player

        mover = playerX if player == 'X' else playerO
        move = mover.select_move(verbose, exploration_rate=exploration_rate)
        g.play(move)

        if g.is_winner(player):
            winner = player
            break

    if verbose:
        for i, b in enumerate(g.history):
            print("Move %d" % i)
            # V is a model here, not a dict: ask it for the board's value
            features = np.array(playerO.flatten(b), dtype=float).reshape(1, -1)
            print(float(np.asarray(V.predict(features, verbose=0)).reshape(-1)[0]))
            print(print_board(b))

    # update V
    # both agents share the model and the history, so train through one only
    if train:
        playerO.train(initial_epoch=game_counter, evaluate=evaluate)

    return winner

# draws within the current block of 100 games, and the finished blocks' totals
draw_count = 0
draw_counts = []
for game_counter in range(NUM_GAMES):
    # linear epsilon decay
    exploration_rate = (1 - game_counter/NUM_GAMES) * START_EXPLORATION_RATE

    # every 100 games: report, start a new block and evaluate the model
    if game_counter % 100 == 0 and game_counter:
        print("%s: Finished %6d Games, Draws in last 100 games: %d" % (time.strftime("%H:%M:%S"), game_counter, draw_count))
        draw_counts.append(draw_count)
        draw_count = 0
        evaluate = True
    else:
        evaluate = False

    # V_hist is not passed: play_game's second positional parameter is
    # board_size, and the agents use the module-level V_hist
    winner = play_game(V_model, exploration_rate=exploration_rate, train=True, evaluate=evaluate,
                       game_counter=game_counter)

    if winner is None:
        draw_count += 1

    print("%s: Game %d %s" % (time.strftime("%H:%M:%S"), game_counter,
                              "Draw" if winner is None else "%s wins!" % winner))


In [None]:
# play human (X) against the saved neural-network agent (O); no training here
V_model = load_model('model')

keep_playing = True
while keep_playing:
    g = Game()
    playerX = HumanAgent(g, 'X')
    playerO = DeepRLagent(g, V_model, V_hist, 'O')

    max_moves = BOARD_SIZE * BOARD_SIZE
    winner = None

    for _ in range(max_moves):
        clear_output()
        print(g)

        player = g.player
        if player != 'X':
            # computer's turn: the agent only picks the move, so play it here
            move = playerO.select_move(verbose=False, exploration_rate=0)
            g.play(move)
        else:
            # get_move() already plays the move on g
            row, col = playerX.get_move()

        if g.is_winner(player):
            winner = player
            break

    clear_output()
    print(g)
    print("Draw" if winner is None else "%s wins!" % winner)

    keep_playing = play_again()
    if not keep_playing:
        print("Bye!")


## 