In [10]:
import numpy as np
import copy
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import regularizers
from keras.layers import Dropout
from sklearn.preprocessing import StandardScaler
from abc import ABC, abstractmethod
from random import random
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import os
import time
import tkinter as tk
#initialize for random seeds/states
tf.keras.backend.clear_session()
os.environ['PYTHONHASHSEED'] = '0'
rng = np.random.default_rng(12345)
np.random.seed(42)
tf.random.set_seed(42)

In [11]:
class paper_game(ABC):
    def __init__(self, start_state):
        self.state=start_state
    
    @abstractmethod
    def transition(state, action, player):
        #return next state
        pass
    
    @abstractmethod
    def reward(self, state, action, player):
        #return reward immediately after action
        pass
    
    @abstractmethod
    def policy(self, state):
        #return action
        pass
    
    @abstractmethod
    def Q_func(self, state, action):
        pass
    
    @abstractmethod
    def Q_update(self, state, action, reward_1, maxQ, player):
        pass

In [12]:
class tit_tac_toe(paper_game):
#state: 3x3 array to represent board config, with 1 representing self-checker, -1 representing opponent-checker, 0 representing empty
    def __init__(self, start_state=np.zeros((3,3), dtype=np.int8), epsilon=0.9, alpha=0.5, gamma=0.8):
        self.state=start_state
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.Q = np.zeros((19683, 9))  #initialize Q matrix with state 'reference' (see ref function) and action key (see key function)    
    
    def transition(self, state, action, player):
        #action: a list of row and column index to indicate which cell is chosen
        #player: 1 or -1
        #applicable to both tic_tac_toe and Connect-4
        #return next state
        new_state = copy.deepcopy(state)
        new_state[action[0]][action[1]] = player
        return new_state
    
    def win_status(self, s):
        #s: state, 
        #return: whether win or not
        win = 3 in np.sum(s, axis=0) or 3 in np.sum(s, axis=1) or np.sum(s.diagonal())==3 or np.sum(np.fliplr(s).diagonal())==3
        return win

    def lose_status(self, s):
        #s: state, 
        #return: whether win or not
        lose = -3 in np.sum(s, axis=0) or -3 in np.sum(s, axis=1) or np.sum(s.diagonal())==-3 or np.sum(np.fliplr(s).diagonal())==-3
        return lose
    
    def ref(self, state):
        #change state matrix into vector
        ref = np.sum(np.matrix([1,3,3**2,3**3,3**4,3**5,3**6,3**7,3**8]) @ np.reshape((state + 1).flatten(),(9,1)))
        return ref
    
    def key(self, action):
        #change actions [r, c] into numbers 0-8
        return 3*action[0]+action[1]
    
    def action_list(self, state):
        act_list = np.where(state==0)
        return act_list

    def reward(self, state, action, player):
        s = player * self.transition(state, action, player)
        if self.win_status(s):
            reward = 100
        else:
            reward = 0
        return reward
    
    def Q_func(self, state, action):
        Q = self.Q[self.ref(state)][self.key(action)]
        return Q

    def Q_update(self, state, action, reward_1, maxQ, player):
        #state of board instead of current player's perspective
        #reward_1: reward of player 1 regardless of who is current player
        #maxQ: player's maxQ after another action
        #player: 1 or -1
        s = player*state
        self.Q[self.ref(s)][self.key(action)] += self.alpha*(reward_1*player + self.gamma*maxQ - self.Q[self.ref(s)][self.key(action)] )


    def random_move(self, state):
        act_list = self.action_list(state)
        i = np.random.randint(len(act_list[0]))
        action = [act_list[0][i],act_list[1][i]]
        return action

    def best_move(self, state):
        act_list = self.action_list(state)
        maxQ = -10000
        for i in range (len(act_list[0])):
            a = [act_list[0][i],act_list[1][i]]
            temp = self.Q_func(state, a)
            if temp > maxQ:
                maxQ = temp
                action = a
        return action, maxQ
    
    def policy(self, state):
        exploit = (not self.train) or rng.random()> self.epsilon or self.play
        if exploit:
            action, maxQ = self.best_move(state)
        else:
            action = self.random_move(state)
        if self.epsilon < 0.5:
            self.epsilon *= 0.9999
        else:
            self.epsilon *= 0.99999    
        return action
    
    def train(self, train_number=2000):
        self.train = True
        self.play = False
        #loop by train_number
        start_time = time.time()
        for i in range (train_number):
            if i % 10000 == 0 and i != 0:
                end_time = time.time()
                elp_time = '{:.2f}'.format(end_time - start_time)
                print(f'Training Phase, epoch {i}, elapsed time:{elp_time}')
                start_time = time.time()
                
            states =[]
            state = self.state #start state
            states.append(state)
            actions=[]
            r1=[0] #reward list at different times for player 1; for player -1: use zero sum property
            t = 0
            #choose who is X, i.e. plays first
            player = 1 #as this is self-play, we simply assume 1 always plays first and be playerX
            endgame = False
            while not endgame:
                #update his Q before action if previous has action
                s = player * state
                if t >=2:
                    action, maxQ = self.best_move(s)
                    self.Q_update(states[t-2], actions[t-2], r1[t], maxQ, player)
                    
                #action
                action = self.policy(s)
                actions.append(action)
                t += 1
                r1.append(player*self.reward(state, action, player))
                state=self.transition(state, action, player)
                states.append(state)
                #check win status and end game status, update Q for both players if end game
                endgame = self.win_status(s) or t==9
                if endgame:
                    self.Q_update(states[t-1], actions[t-1], r1[t], 0, player) #r1[t+1] is not used as there won't be t+1 when endgame is reached
                    self.Q_update(states[t-2], actions[t-2], r1[t], 0, -player)
                    
                #update player for next loop
                player *= -1
        self.last_episode = states
        
    def display_board(self, state):
        print('-------------------')
        print(state)
        print('-------------------')

    def display_episode(self):
        for i in range(len(self.last_episode)):
            print('state {} :'.format(i))
            self.display_board(self.last_episode[i])
            
    def play_game(self, computer_start = True):
        #in our game, 1 means computer, -1 means opponent (i.e. human)
        self.play = True
        self.train = False
        state = np.zeros((3,3), dtype=np.int8)
        if computer_start:
            player = 1
        else:
            player = -1
        endgame = False
        t = 0
        self.display_board(state)
        while not endgame:
            if player == -1:
                i = int(input('enter row:\n'))
                j = int(input('enter column:\n'))
                state[i,j] = -1
            else:
                i,j = self.best_move(state)[0]
                state[i,j] = 1

            self.display_board(state)   
            player *= -1
            t += 1
            endgame = self.win_status(state) or self.lose_status(state) or t==9
            
                    
   
    def play_gui(self):
        pass

In [13]:
test1 = tit_tac_toe()
test1.train(10000)
# test1.display_episode()

In [14]:
test1.play_game(computer_start = True)

-------------------
[[0 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
-------------------
[[0 0 0]
 [0 0 1]
 [0 0 0]]
-------------------
enter row:
0
enter column:
0
-------------------
[[-1  0  0]
 [ 0  0  1]
 [ 0  0  0]]
-------------------
-------------------
[[-1  0  0]
 [ 0  1  1]
 [ 0  0  0]]
-------------------
enter row:
1
enter column:
0
-------------------
[[-1  0  0]
 [-1  1  1]
 [ 0  0  0]]
-------------------
-------------------
[[-1  0  0]
 [-1  1  1]
 [ 1  0  0]]
-------------------
enter row:
0
enter column:
2
-------------------
[[-1  0 -1]
 [-1  1  1]
 [ 1  0  0]]
-------------------
-------------------
[[-1  0 -1]
 [-1  1  1]
 [ 1  1  0]]
-------------------
enter row:
0
enter column:
1
-------------------
[[-1 -1 -1]
 [-1  1  1]
 [ 1  1  0]]
-------------------


In [15]:
test2 = tit_tac_toe()
test2.train(100000)

Training Phase, epoch 10000, elapsed time:19.88
Training Phase, epoch 20000, elapsed time:25.26
Training Phase, epoch 30000, elapsed time:25.29
Training Phase, epoch 40000, elapsed time:25.52
Training Phase, epoch 50000, elapsed time:25.39
Training Phase, epoch 60000, elapsed time:25.55
Training Phase, epoch 70000, elapsed time:25.75
Training Phase, epoch 80000, elapsed time:25.79
Training Phase, epoch 90000, elapsed time:25.46


In [16]:
test2.play_game(computer_start = True)

-------------------
[[0 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
-------------------
[[0 1 0]
 [0 0 0]
 [0 0 0]]
-------------------
enter row:
1
enter column:
1
-------------------
[[ 0  1  0]
 [ 0 -1  0]
 [ 0  0  0]]
-------------------
-------------------
[[ 0  1  0]
 [ 0 -1  1]
 [ 0  0  0]]
-------------------
enter row:
0
enter column:
2
-------------------
[[ 0  1 -1]
 [ 0 -1  1]
 [ 0  0  0]]
-------------------
-------------------
[[ 0  1 -1]
 [ 0 -1  1]
 [ 1  0  0]]
-------------------
enter row:
0
enter column:
0
-------------------
[[-1  1 -1]
 [ 0 -1  1]
 [ 1  0  0]]
-------------------
-------------------
[[-1  1 -1]
 [ 0 -1  1]
 [ 1  0  1]]
-------------------
enter row:
2
enter column:
1
-------------------
[[-1  1 -1]
 [ 0 -1  1]
 [ 1 -1  1]]
-------------------
-------------------
[[-1  1 -1]
 [ 1 -1  1]
 [ 1 -1  1]]
-------------------


In [17]:
test2.play_game(computer_start = False)

-------------------
[[0 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
enter row:
1
enter column:
1
-------------------
[[ 0  0  0]
 [ 0 -1  0]
 [ 0  0  0]]
-------------------
-------------------
[[ 0  0  0]
 [ 0 -1  0]
 [ 1  0  0]]
-------------------
enter row:
2
enter column:
2
-------------------
[[ 0  0  0]
 [ 0 -1  0]
 [ 1  0 -1]]
-------------------
-------------------
[[ 1  0  0]
 [ 0 -1  0]
 [ 1  0 -1]]
-------------------
enter row:
1
enter column:
0
-------------------
[[ 1  0  0]
 [-1 -1  0]
 [ 1  0 -1]]
-------------------
-------------------
[[ 1  0  0]
 [-1 -1  1]
 [ 1  0 -1]]
-------------------
enter row:
0
enter column:
1
-------------------
[[ 1 -1  0]
 [-1 -1  1]
 [ 1  0 -1]]
-------------------
-------------------
[[ 1 -1  0]
 [-1 -1  1]
 [ 1  1 -1]]
-------------------
enter row:
0
enter column:
2
-------------------
[[ 1 -1 -1]
 [-1 -1  1]
 [ 1  1 -1]]
-------------------


In [19]:
test3 = tit_tac_toe()
test3.train(1000000) #self play 1 million times, expect 2500 seconds to run...

Training Phase, epoch 10000, elapsed time:20.07
Training Phase, epoch 20000, elapsed time:26.10
Training Phase, epoch 30000, elapsed time:26.13
Training Phase, epoch 40000, elapsed time:25.72
Training Phase, epoch 50000, elapsed time:25.81
Training Phase, epoch 60000, elapsed time:25.91
Training Phase, epoch 70000, elapsed time:25.84
Training Phase, epoch 80000, elapsed time:25.76
Training Phase, epoch 90000, elapsed time:25.78
Training Phase, epoch 100000, elapsed time:25.79
Training Phase, epoch 110000, elapsed time:25.83
Training Phase, epoch 120000, elapsed time:25.66
Training Phase, epoch 130000, elapsed time:25.80
Training Phase, epoch 140000, elapsed time:25.74
Training Phase, epoch 150000, elapsed time:25.77
Training Phase, epoch 160000, elapsed time:25.94
Training Phase, epoch 170000, elapsed time:26.20
Training Phase, epoch 180000, elapsed time:25.91
Training Phase, epoch 190000, elapsed time:25.76
Training Phase, epoch 200000, elapsed time:25.84
Training Phase, epoch 210000,

In [20]:
test3.play_game(computer_start = True)

-------------------
[[0 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
-------------------
[[1 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
enter row:
1
enter column:
1
-------------------
[[ 1  0  0]
 [ 0 -1  0]
 [ 0  0  0]]
-------------------
-------------------
[[ 1  0  1]
 [ 0 -1  0]
 [ 0  0  0]]
-------------------
enter row:
0
enter column:
1
-------------------
[[ 1 -1  1]
 [ 0 -1  0]
 [ 0  0  0]]
-------------------
-------------------
[[ 1 -1  1]
 [ 0 -1  0]
 [ 0  1  0]]
-------------------
enter row:
1
enter column:
0
-------------------
[[ 1 -1  1]
 [-1 -1  0]
 [ 0  1  0]]
-------------------
-------------------
[[ 1 -1  1]
 [-1 -1  1]
 [ 0  1  0]]
-------------------
enter row:
2
enter column:
2
-------------------
[[ 1 -1  1]
 [-1 -1  1]
 [ 0  1 -1]]
-------------------
-------------------
[[ 1 -1  1]
 [-1 -1  1]
 [ 1  1 -1]]
-------------------


In [21]:
test3.play_game(computer_start = False)

-------------------
[[0 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
enter row:
1
enter column:
1
-------------------
[[ 0  0  0]
 [ 0 -1  0]
 [ 0  0  0]]
-------------------
-------------------
[[ 0  0  0]
 [ 0 -1  0]
 [ 0  0  1]]
-------------------
enter row:
0
enter column:
2
-------------------
[[ 0  0 -1]
 [ 0 -1  0]
 [ 0  0  1]]
-------------------
-------------------
[[ 0  0 -1]
 [ 0 -1  0]
 [ 1  0  1]]
-------------------
enter row:
2
enter column:
1
-------------------
[[ 0  0 -1]
 [ 0 -1  0]
 [ 1 -1  1]]
-------------------
-------------------
[[ 0  1 -1]
 [ 0 -1  0]
 [ 1 -1  1]]
-------------------
enter row:
1
enter column:
0
-------------------
[[ 0  1 -1]
 [-1 -1  0]
 [ 1 -1  1]]
-------------------
-------------------
[[ 0  1 -1]
 [-1 -1  1]
 [ 1 -1  1]]
-------------------
enter row:
0
enter column:
0
-------------------
[[-1  1 -1]
 [-1 -1  1]
 [ 1 -1  1]]
-------------------


In [22]:
test3.play_game(computer_start = True)

-------------------
[[0 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
-------------------
[[1 0 0]
 [0 0 0]
 [0 0 0]]
-------------------
enter row:
1
enter column:
0
-------------------
[[ 1  0  0]
 [-1  0  0]
 [ 0  0  0]]
-------------------
-------------------
[[ 1  0  1]
 [-1  0  0]
 [ 0  0  0]]
-------------------
enter row:
0
enter column:
1
-------------------
[[ 1 -1  1]
 [-1  0  0]
 [ 0  0  0]]
-------------------
-------------------
[[ 1 -1  1]
 [-1  1  0]
 [ 0  0  0]]
-------------------
enter row:
2
enter column:
0
-------------------
[[ 1 -1  1]
 [-1  1  0]
 [-1  0  0]]
-------------------
-------------------
[[ 1 -1  1]
 [-1  1  0]
 [-1  0  1]]
-------------------
