# Tic Tac Toe
With Neural Network for State Value estimation

### Outline of approach:
1. Pretrain: Play at least 100 games and collect the training data (states and their values) as a list.  \
    Train two value-predictor networks on this data - one for P1 and another for P2. \
    In the first step, the predictor networks give random values, which are used for the first batch of games
2. RL Train: In a loop \
    a) play n games using the trained networks and epsilon greedy approach \
    b) record the outcomes and compute state values \
    c) use this data to retrain the two networks \
3. Train till convergence

Reference url for Tic Tac Toe environment: https://github.com/MJeremy2017/reinforcement-learning-implementation/blob/master/TicTacToe/ticTacToe.py 

### Basic package imports

In [2]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import re
import keras
from keras import layers
from keras.layers import Dense, Activation
# Board dimensions used by State and Player when creating and flattening the board
BOARD_ROWS = 3
BOARD_COLS = 3

2024-01-28 17:40:13.807398: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-01-28 17:40:13.807430: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Classes and Keras Models

In [3]:
class State:
    '''
    Tic-Tac-Toe board together with the game loops that drive two players.

    Cells hold 1 for player 1, -1 for player 2 and 0 when empty; p1 always
    moves first. Both players must provide chooseAction, addState,
    feedReward and reset. play() additionally calls setEps on them and
    NNPlay() calls sVNNtrain.
    '''
    def __init__(self, p1, p2):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None
        # init p1 plays first
        self.playerSymbol = 1

    # get unique hash of current board state
    def getHash(self):
        '''Store and return the string form of the flattened board.'''
        self.boardHash = str(self.board.reshape(-1))
        return self.boardHash

    def winner(self):
        '''
        Evaluate the board and update self.isEnd accordingly.

        Returns 1 if player 1 owns a complete line, -1 if player 2 does,
        0 for a tie (board full, no line) and None while the game is open.
        Rows are checked first, then columns, then the two diagonals.
        '''
        n_rows, n_cols = self.board.shape
        lines = [self.board[r, :] for r in range(n_rows)]
        lines += [self.board[:, c] for c in range(n_cols)]
        for line in lines:
            total = line.sum()
            # a line is won when every one of its cells carries the same symbol,
            # i.e. the sum equals +/- the line length (3 on the standard board)
            if total == len(line):
                self.isEnd = True
                return 1
            if total == -len(line):
                self.isEnd = True
                return -1

        # main diagonal and anti-diagonal
        diagonals = (np.diag(self.board), np.diag(np.fliplr(self.board)))
        if any(d.sum() == len(d) for d in diagonals):
            self.isEnd = True
            return 1
        if any(d.sum() == -len(d) for d in diagonals):
            self.isEnd = True
            return -1

        # tie: no available positions
        if not self.availablePositions():
            self.isEnd = True
            return 0
        # not end
        self.isEnd = False
        return None

    def availablePositions(self):
        '''Return the empty cells as (row, col) tuples in row-major order.'''
        n_rows, n_cols = self.board.shape
        # entries need to be tuples so they can index the board directly
        return [(i, j)
                for i in range(n_rows)
                for j in range(n_cols)
                if self.board[i, j] == 0]

    def updateState(self, position):
        '''Place the current player's symbol at position and switch player.'''
        self.board[position] = self.playerSymbol
        # switch to another player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    # only when game ends
    def giveReward(self):
        '''Feed +1 / -1 to winner / loser, or 0 to both when nobody won.'''
        result = self.winner()
        # backpropagate reward
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(-1)
        elif result == -1:
            self.p1.feedReward(-1)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0)
            self.p2.feedReward(0)

    # board reset
    def reset(self):
        '''Clear the board and hand the first move back to player 1.'''
        self.board = np.zeros(self.board.shape)
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def _takeTurn(self, player):
        '''Let player move, record the resulting board hash, return winner().'''
        positions = self.availablePositions()
        action = player.chooseAction(positions, self.board, self.playerSymbol)
        # take action and update board state
        self.updateState(action)
        player.addState(self.getHash())
        return self.winner()

    def _finishGame(self):
        '''Distribute rewards, then reset both players and the board.'''
        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()

    def play(self, rounds=100):
        '''
        Play `rounds` games between p1 and p2.

        Returns a list with one entry per game: 1 (p1 won), -1 (p2 won)
        or 0 (tie). setEps is called on both players every 100th game.
        '''
        winlist = []
        for i in range(rounds):
            if i % 100 == 0:
                self.p1.setEps(rounds, i)
                self.p2.setEps(rounds, i)
            while not self.isEnd:
                win = self._takeTurn(self.p1)
                if win is None:
                    # game still open after p1's move, so p2 answers
                    win = self._takeTurn(self.p2)
                if win is not None:
                    self._finishGame()
                    break
            winlist.append(win)
        return winlist

    # play with human
    def play2(self):
        '''
        Play one interactive game, printing the board after every move.

        p1 is asked for its move with (positions, board, symbol) while p2
        is asked with positions only, which matches HumanPlayer.chooseAction.
        No rewards are given; the board is reset when the game ends.
        '''
        while not self.isEnd:
            # Player 1
            positions = self.availablePositions()
            p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
            # take action and update board state
            self.updateState(p1_action)
            self.showBoard()
            # check board status if it is end
            win = self.winner()
            if win is not None:
                if win == 1:
                    print(self.p1.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break

            # Player 2
            positions = self.availablePositions()
            p2_action = self.p2.chooseAction(positions)

            self.updateState(p2_action)
            self.showBoard()
            win = self.winner()
            if win is not None:
                if win == -1:
                    print(self.p2.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break

    def NNPlay(self, rounds=500, innerrounds=50):
        '''
        train innerrounds, capture replay buffer and train the s value networks on this data
        after every inner round the replay buffer is emptied and the process repeated again
        '''
        # NOTE(review): the "+ 1" means one more batch than rounds / innerrounds
        # is played (e.g. 2000 / 50 gives 41 batches = 2050 games). Kept as is
        # to preserve existing behaviour.
        train_batches = int(rounds/innerrounds) + 1

        print ('training for {} rounds, with {} training batches and {} inner rounds'.format(rounds, train_batches, innerrounds))

        for i in range(train_batches):
            print ('training batch', i)
            self.play(innerrounds)
            # train player p1
            self.p1.sVNNtrain()
            # train player p2
            self.p2.sVNNtrain()
        return ()

    def showBoard(self):
        '''Print the board; p1 is drawn as x, p2 as o.'''
        # p1: x  p2: o ; anything else is shown as '?' instead of crashing
        tokens = {1: 'x', -1: 'o', 0: ' '}
        n_rows, n_cols = self.board.shape
        rule = '-' * (4 * n_cols + 1)  # 13 dashes on the 3x3 board
        for i in range(n_rows):
            print(rule)
            out = '| '
            for j in range(n_cols):
                out += tokens.get(self.board[i, j], '?') + ' | '
            print(out)
        print(rule)


In [4]:
class Player:
    '''
    Class for one Tic Tac Toe player

    Moves are chosen epsilon-greedily using a small Keras network that
    predicts the value of the board reached by each candidate move. During
    a game the visited board hashes are collected in self.states; at the
    end feedReward() turns them into value targets in self.states_value,
    which sVNNtrain() uses to refit the network.
    '''
    def __init__(self, name, eps_decay=False, start_exp_rate=0.3, end_exp_rate=0.05):
        self.name = name
        self.states = []  # record all positions taken
        self.lr = 0.3
        self.exp_rate = start_exp_rate

        self.decay_gamma = 0.9
        self.states_value = {}  # state -> value

        self.eps_decay = eps_decay
        self.start_exp_rate = start_exp_rate
        self.end_exp_rate = end_exp_rate
        self.state_value_model = self.sValueNN()

    def getHash(self, board):
        '''Return the string form of the flattened board.'''
        return str(board.reshape(-1))

    def getSVal(self, board):
        '''Return the network's value prediction for a single board.'''
        # the model expects a batch, so the board becomes one flat row
        features = np.reshape(board, (1, -1))
        return self.state_value_model.predict(features, verbose=False)[0][0]

    def sValueNN(self):
        '''Build the value network: 9 inputs -> 4 linear units -> 1 linear output.'''
        model = keras.models.Sequential()
        model.add(Dense(units=4, input_dim=9, activation='linear'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_squared_error', optimizer='sgd')
        return (model)

    def getbuffer(self):
        '''
        Write self.states_value to 'Buffer.csv' (columns x0..x8 and val).

        The nine cell values are recovered from each board-hash string with
        a regular expression. An empty buffer yields a header-only file.
        '''
        rows = []
        for board_hash, value in self.states_value.items():
            cells = re.findall( r'[-/+]?\d+\.*\d*', board_hash)
            cells.append(value)
            rows.append(cells)
        cols = ['x'+str(i) for i in range(9)]
        cols.append('val')
        # passing the columns up front keeps an empty buffer from raising
        lldf = pd.DataFrame(rows, columns=cols)
        lldf.to_csv('Buffer.csv', index=False)
        return

    def sVNNtrain(self, Xin = None, Yin = None):
        '''
        Fit the value network for 10 epochs and empty the replay buffer.

        Without arguments the training data is the current replay buffer,
        round-tripped through 'Buffer.csv'; Xin / Yin override it. If the
        buffer is empty and no data is supplied, fitting is skipped.
        '''
        # call this function to create buffer
        self.getbuffer()
        # read the buffer
        df = pd.read_csv('Buffer.csv')
        print ('length of buffer is', len(df))
        traincols=['x' + str(i) for i in range(9)]
        testcol = 'val'
        if Xin is None:
            if df.empty:
                # nothing recorded since the last training step
                self.states_value = {}
                return
            Xin = df[traincols]
            Yin = df[testcol]
        self.state_value_model.fit(Xin, Yin, epochs=10, verbose=False)

        # empty the replay buffer after this
        self.states_value = {}
        return

    def loadSVNNmodel(self, path):
        '''Replace the value network with the Keras model stored at path.'''
        self.state_value_model = keras.models.load_model(path)

    def setEps(self, total_games, current_game):
        '''
        Linearly move exp_rate from start_exp_rate towards end_exp_rate.

        Does nothing unless eps_decay is set, and only takes effect from
        game 100 onwards.
        '''
        if not self.eps_decay:
            return
        if int(current_game/100) > 0:
            progress = current_game/total_games
            self.exp_rate = self.start_exp_rate * (1. - progress) + self.end_exp_rate * progress
            if (np.mod(current_game, 1000) == 0):
                print ('decay rate modified at {} with current value of {}'.format(str(current_game), str(self.exp_rate)))
        return

    def chooseAction(self, positions, current_board, symbol):
        '''
        Pick a move from positions for the player using symbol.

        With probability exp_rate a random position is returned; otherwise
        the position whose resulting board has the highest predicted value
        (the last one among equals).
        '''
        if np.random.uniform(0, 1) <= self.exp_rate:
            # take random action
            return positions[np.random.choice(len(positions))]

        best_action = None
        best_value = -np.inf
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            # use the keras model to predict the state value
            value = self.getSVal(next_board)
            if value >= best_value:
                best_value = value
                best_action = p
        if best_action is None:
            # every prediction was NaN (e.g. a diverged network); fall back to
            # a random legal move instead of failing on an unset variable
            best_action = positions[np.random.choice(len(positions))]
        return best_action

    # append a hash state
    def addState(self, state):
        self.states.append(state)

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        '''Propagate the final reward backwards through the visited states.'''
        for st in reversed(self.states):
            old = self.states_value.get(st)
            if old is None:
                old = 0
            new = old + self.lr * (self.decay_gamma * reward - old)
            self.states_value[st] = new
            # the updated value becomes the target for the preceding state
            reward = new

    def reset(self):
        self.states = []

    def savePolicy(self):
        '''Pickle self.states_value to the file 'policy_<name>'.'''
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        '''Load self.states_value from a file written by savePolicy.'''
        # WARNING: unpickling can execute arbitrary code; only load trusted files
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

In [5]:
class HumanPlayer:
    '''
    Class for a human player

    Moves are read from standard input. The learning hooks (addState,
    feedReward, reset) exist so the game loop can call them, but do nothing.
    '''
    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions):
        '''Prompt for a row and column until an entry from positions is given.'''
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # a non-numeric entry should re-prompt, not abort the game
                print("Please enter whole numbers for row and col.")
                continue
            action = (row, col)
            if action in positions:
                return action

    # append a hash state
    def addState(self, state):
        pass

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        pass

    def reset(self):
        pass


In [6]:
# Two learning agents with a fixed exploration rate of 0.2. With
# eps_decay=False, Player.setEps returns early, so end_exp_rate is never applied.
p1 = Player("p1", eps_decay = False, start_exp_rate=0.2, end_exp_rate=0.02)
p2 = Player("p2", eps_decay = False, start_exp_rate=0.2, end_exp_rate=0.02)
print (p1.exp_rate, p2.exp_rate)

0.2 0.2


2024-01-28 17:40:15.239590: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2024-01-28 17:40:15.239612: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2024-01-28 17:40:15.239629: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dsrivallabha-PC): /proc/driver/nvidia/version does not exist
2024-01-28 17:40:15.240076: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Examine the defined model 

In [7]:
# Print the layer and parameter summary of p1's state-value network
p1.state_value_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 40        
                                                                 
 dense_1 (Dense)             (None, 1)                 5         
                                                                 
Total params: 45
Trainable params: 45
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Dump the configuration and the weights of every layer of p1's network
mm = p1.state_value_model
for layer in mm.layers: print(layer.get_config(), layer.get_weights())

{'name': 'dense', 'trainable': True, 'batch_input_shape': (None, 9), 'dtype': 'float32', 'units': 4, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'GlorotUniform', 'config': {'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None} [array([[ 0.24858743, -0.21132362, -0.37820578,  0.09079397],
       [-0.6750407 ,  0.00411654, -0.6061317 ,  0.29925823],
       [ 0.33388782, -0.5875468 ,  0.0451318 ,  0.16753101],
       [ 0.4180441 ,  0.58865845,  0.05096269,  0.5291933 ],
       [-0.01286185,  0.5965493 , -0.24636659,  0.26979625],
       [-0.19535029,  0.17613   ,  0.48654425, -0.48584723],
       [ 0.30961984, -0.3161221 , -0.6518061 , -0.14878696],
       [ 0.24420196, -0.28799623, -0.31704247,  0.67030394],
       [-0.07619268, -0.63899237, -0.01540369, -0.16941732]],
      dtype=float32), array([0

## Train the two players, with constant epsilon 

In [8]:
# Train the model: p1 and p2 play each other in batches of games, and each
# player's value network is refit after every batch (see State.NNPlay)
st = State(p1, p2)
st.NNPlay(2000)

training for 2000 rounds, with 41 training batches and 50 inner rounds
training batch 0
length of buffer is 72
length of buffer is 67
training batch 1
length of buffer is 68
length of buffer is 69
training batch 2
length of buffer is 57
length of buffer is 49
training batch 3
length of buffer is 61
length of buffer is 52
training batch 4
length of buffer is 51
length of buffer is 51
training batch 5
length of buffer is 52
length of buffer is 47
training batch 6
length of buffer is 72
length of buffer is 64
training batch 7
length of buffer is 53
length of buffer is 47
training batch 8
length of buffer is 59
length of buffer is 57
training batch 9
length of buffer is 54
length of buffer is 58
training batch 10
length of buffer is 65
length of buffer is 57
training batch 11
length of buffer is 50
length of buffer is 48
training batch 12
length of buffer is 61
length of buffer is 58
training batch 13
length of buffer is 65
length of buffer is 60
training batch 14
length of buffer is 64
le

()

### Examine and save the keras model weights

In [9]:
# Dump the configuration and the weights of every layer of p1's network
mm = st.p1.state_value_model
for layer in mm.layers: print(layer.get_config(), layer.get_weights())

{'name': 'dense', 'trainable': True, 'batch_input_shape': (None, 9), 'dtype': 'float32', 'units': 4, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'GlorotUniform', 'config': {'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None} [array([[ 0.40683657, -0.4158106 , -0.32265148, -0.30908552],
       [-0.3512696 ,  0.29168206,  0.4053993 ,  0.3450481 ],
       [ 0.15070567, -0.25628412, -0.00177603, -0.08401844],
       [-0.33369794, -0.55786455, -0.13194144,  0.03971705],
       [ 0.2153035 , -0.23054773, -0.6323803 , -0.5408313 ],
       [ 0.3124842 , -0.56576216,  0.2588895 ,  0.21737023],
       [-0.24144727,  0.14803237, -0.4194094 , -0.54073966],
       [ 0.36176884, -0.3991822 , -0.13209403, -0.12980594],
       [-0.16940153,  0.15293834,  0.32446122,  0.0399254 ]],
      dtype=float32), array([-

In [10]:
# Dump the configuration and the weights of every layer of p2's network
mm = st.p2.state_value_model
for layer in mm.layers: print(layer.get_config(), layer.get_weights())

{'name': 'dense_2', 'trainable': True, 'batch_input_shape': (None, 9), 'dtype': 'float32', 'units': 4, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'GlorotUniform', 'config': {'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None} [array([[-0.02253479, -0.33591506,  0.31116265, -0.4227688 ],
       [-0.03243079,  0.34823313, -0.1830587 , -0.64018464],
       [ 0.5267765 , -0.26305583, -0.25569472,  0.19048546],
       [-0.42434773,  0.06299321,  0.12105923,  0.00781979],
       [ 0.1615687 , -0.43870527, -0.00705629,  0.01748821],
       [-0.6103813 ,  0.32546517,  0.01076336,  0.47630298],
       [-0.26896507, -0.12461756,  0.38393894, -0.51620823],
       [-0.5953689 , -0.17032419,  0.41177627, -0.3451145 ],
       [ 0.04939187, -0.34967712,  0.1777797 , -0.3504383 ]],
      dtype=float32), array(

In [11]:
# Save p1's value network; it is loaded again below via Player.loadSVNNmodel
st.p1.state_value_model.save('p1model.keras')

In [12]:
# Save p2's value network to disk
st.p2.state_value_model.save('p2model.keras')

# Check learning by playing against opponent 

In [9]:
# Load the trained first player as p3. An exploration rate of 0 means its
# moves are chosen by the value network rather than at random.
p3 = Player("p3", eps_decay = False, start_exp_rate=0, end_exp_rate=0)
p3.loadSVNNmodel('p1model.keras')

In [10]:
# Second player is a random player: start_exp_rate = 1 makes every action of this player random
p4 = Player("p4", eps_decay = False, start_exp_rate=1, end_exp_rate=0)

In [11]:
# Play multiple batches of 100 games with trained P3 and random P4 and evaluate the winning probabilities.
# State.play returns one entry per game: 1 = first player won, -1 = second player won, 0 = tie.
st = State(p3, p4)

for i in range(10):
    output = st.play(100)
    print ('expt {} in this batch of 100 games trained p1 wins {}, random p2 wins {} and ties {}'.format(i, output.count(1), output.count(-1), output.count(0)))

expt 0 in this batch of 100 games trained p1 wins 76, random p2 wins 13 and ties 11
expt 1 in this batch of 100 games trained p1 wins 75, random p2 wins 14 and ties 11
expt 2 in this batch of 100 games trained p1 wins 72, random p2 wins 20 and ties 8
expt 3 in this batch of 100 games trained p1 wins 74, random p2 wins 19 and ties 7
expt 4 in this batch of 100 games trained p1 wins 77, random p2 wins 12 and ties 11
expt 5 in this batch of 100 games trained p1 wins 74, random p2 wins 20 and ties 6
expt 6 in this batch of 100 games trained p1 wins 80, random p2 wins 10 and ties 10
expt 7 in this batch of 100 games trained p1 wins 77, random p2 wins 10 and ties 13
expt 8 in this batch of 100 games trained p1 wins 77, random p2 wins 12 and ties 11
expt 9 in this batch of 100 games trained p1 wins 79, random p2 wins 12 and ties 9


### Compare against the baseline, where two random players play against each other

In [12]:
# Baseline for the comparison: two players that both act purely at random (exploration rate 1)
p5 = Player("p5", eps_decay = False, start_exp_rate=1, end_exp_rate=0)
p6 = Player("p6", eps_decay = False, start_exp_rate=1, end_exp_rate=0)

In [13]:
# Ten batches of 100 games between the two random players; per batch, count
# first-player wins (1), second-player wins (-1) and ties (0)
st = State(p5, p6)

for i in range(10):
    output = st.play(100)
    print ('expt {} in this batch of 100 games random p1 wins {}, random p2 wins {} and ties {}'.format(i, output.count(1), output.count(-1), output.count(0)))

expt 0 in this batch of 100 games random p1 wins 54, random p2 wins 30 and ties 16
expt 1 in this batch of 100 games random p1 wins 61, random p2 wins 23 and ties 16
expt 2 in this batch of 100 games random p1 wins 62, random p2 wins 24 and ties 14
expt 3 in this batch of 100 games random p1 wins 62, random p2 wins 24 and ties 14
expt 4 in this batch of 100 games random p1 wins 61, random p2 wins 26 and ties 13
expt 5 in this batch of 100 games random p1 wins 62, random p2 wins 24 and ties 14
expt 6 in this batch of 100 games random p1 wins 59, random p2 wins 27 and ties 14
expt 7 in this batch of 100 games random p1 wins 59, random p2 wins 30 and ties 11
expt 8 in this batch of 100 games random p1 wins 67, random p2 wins 24 and ties 9
expt 9 in this batch of 100 games random p1 wins 57, random p2 wins 31 and ties 12
