# Working code, computer vs computer AI agents

In [56]:
import numpy as np
import random

# Define the Tic-Tac-Toe board
board_rows = 3
board_cols = 3

# Define the player markers
player_1 = 1
player_2 = 2

# Define the winning combinations
winning_combinations = [
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8],
    [0, 3, 6],
    [1, 4, 7],
    [2, 5, 8],
    [0, 4, 8],
    [2, 4, 6],
]

class QLearningAgent:
    def __init__(self, alpha, gamma, epsilon, learning_rate, discount_factor, player_marker):
        self.q_table = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.player_marker = player_marker

    def get_q_value(self, state, action):
        # If the state doesn't exist in the q_table, create a new entry
        if state not in self.q_table:
            self.q_table[state] = np.zeros(board_rows * board_cols)

        return self.q_table[state][action]

    def choose_action(self, state):
        # Convert the state to a bytes object
        state_bytes = bytes(state)

        # Add a new entry to the Q-table if the current state is not present
        if state_bytes not in self.q_table:
            self.q_table[state_bytes] = np.zeros(board_rows * board_cols)

        # Choose a random action with probability epsilon
        if np.random.rand() < self.epsilon:
            action = np.random.randint(board_rows * board_cols)
        else:
            # Choose the action with the highest Q-value
            q_values = self.q_table[state_bytes]
            max_q_value = np.max(q_values)
            actions = np.where(q_values == max_q_value)[0]
            action = np.random.choice(actions)

        return action


    def update_q_value(self, state, action, next_state, reward):
        # Get the current Q-value
        q_value = self.get_q_value(state, action)

        # Get the maximum Q-value for the next state
        if next_state not in self.q_table:
            self.q_table[next_state] = np.zeros(board_rows * board_cols)
        max_q_value = np.max(self.q_table[next_state])

        # Update the Q-value using the Bellman equation
        new_q_value = q_value + self.alpha * (reward + self.gamma * max_q_value - q_value)

        # Update the Q-table
        self.q_table[state][action] = new_q_value

class TicTacToeGame:
    def __init__(self, player_1, player_2):
        self.board = np.zeros((board_rows, board_cols))
        self.player_1 = player_1
        self.player_2 = player_2
        self.current_player = self.player_1
        self.winner = None

    def get_state(self):
        # Convert the board into a string
        return self.board.tobytes()

    def get_valid_moves(self):
        # Get the indices of all empty squares on the board
        return np.where(self.board.flatten() == 0)[0]

    def is_valid_move(self, move):
        # Check if the move is valid
        return move in self.get_valid_moves()

    def make_move(self, move):
        # Make a move on the board
        row = move // board_cols
        col = move % board_cols
        self.board[row][col] = self.current_player

    def switch_player(self):
        # Switch to the next player
        if self.current_player == self.player_1:
            self.current_player = self.player_2
        else:
            self.current_player = self.player_1

    def check_winner(self):
        # Check if any player has won
        for combination in winning_combinations:
            if (self.board.flatten()[combination] == self.current_player).all():
                self.winner = self.current_player
                return True

        # Check if the game is a draw
        if len(self.get_valid_moves()) == 0:
            self.winner = 0
            return True

        # If there is no winner or draw, return False
        return False

    def reset(self):
        # Reset the board and current player
        self.board = np.zeros((board_rows, board_cols))
        self.current_player = self.player_1
        self.winner = None

    def play_game(self, agent_1, agent_2, train=True):
        # Reset the game
        self.reset()

        # Loop until there is a winner or a draw
        while not self.check_winner():
            # Get the current state
            state = self.get_state()

            # Choose an action for the current player
            if self.current_player == self.player_1:
                action = agent_1.choose_action(state)
            else:
                action = agent_2.choose_action(state)

            # Make the move on the board
            self.make_move(action)

            # Check if there is a winner or a draw
            self.check_winner()

            # Calculate the reward for the agents
            if self.winner == agent_1.player_marker:
                reward_1 = 1
                reward_2 = -1
            elif self.winner == agent_2.player_marker:
                reward_1 = -1
                reward_2 = 1
            else:
                reward_1 = 0
                reward_2 = 0

            # Update the Q-values for the agents
            if train:
                next_state = self.get_state()

                agent_1.update_q_value(state, action, next_state, reward_1)
                agent_2.update_q_value(state, action, next_state, reward_2)

            # Switch to the next player
            self.switch_player()

        # Return the winner
        return self.winner


if __name__ == "__main__":
    # Create the agents
    agent_1 = QLearningAgent(alpha = 0.5, gamma=0.9, epsilon=0.9, learning_rate=0.1, discount_factor=0.9,
                             player_marker=player_1)
    agent_2 = QLearningAgent(alpha = 0.5, gamma=0.9, epsilon=0.9, learning_rate=0.1, discount_factor=0.9,
                             player_marker=player_2)

    # Train the agents
    num_episodes = 100000
    for i in range(num_episodes):
        game = TicTacToeGame(player_1, player_2)
        winner = game.play_game(agent_1, agent_2)
        agent_1.epsilon *= 0.999
        agent_2.epsilon *= 0.999

        if i % 10000 == 0:
            print(f"Episode {i}: winner = {winner}, epsilon = {agent_1.epsilon:.3f}")

    # Test the agents
    num_games = 100
    num_wins = 0
    for i in range(num_games):
        game = TicTacToeGame(player_1, player_2)
        winner = game.play_game(agent_1, agent_2)
        agent_1.epsilon *= 0.999
        agent_2.epsilon *= 0.999

        if winner == player_1:
            num_wins += 1

    print(f"Agent 1 win rate: {num_wins / num_games * 100:.2f}%")

Episode 0: winner = 2, epsilon = 0.899
Episode 10000: winner = 1, epsilon = 0.000
Episode 20000: winner = 1, epsilon = 0.000
Episode 30000: winner = 1, epsilon = 0.000
Episode 40000: winner = 1, epsilon = 0.000
Episode 50000: winner = 1, epsilon = 0.000
Episode 60000: winner = 1, epsilon = 0.000
Episode 70000: winner = 1, epsilon = 0.000
Episode 80000: winner = 1, epsilon = 0.000
Episode 90000: winner = 1, epsilon = 0.000
Agent 1 win rate: 100.00%


# Working code human with computer without GUI

In [65]:
import numpy as np
import random

# Define the Tic-Tac-Toe board
board_rows = 3
board_cols = 3

# Define the player markers
player_1 = 1
player_2 = 2

# Define the winning combinations
winning_combinations = [
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8],
    [0, 3, 6],
    [1, 4, 7],
    [2, 5, 8],
    [0, 4, 8],
    [2, 4, 6],
]

class QLearningAgent:
    def __init__(self, alpha, gamma, epsilon, learning_rate, discount_factor, player_marker):
        self.q_table = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.player_marker = player_marker

    def get_q_value(self, state, action):
        # If the state doesn't exist in the q_table, create a new entry
        if state not in self.q_table:
            self.q_table[state] = np.zeros(board_rows * board_cols)

        return self.q_table[state][action]

    def choose_action(self, state):
        # Convert the state to a bytes object
        state_bytes = bytes(state)

        # Add a new entry to the Q-table if the current state is not present
        if state_bytes not in self.q_table:
            self.q_table[state_bytes] = np.zeros(board_rows * board_cols)

        # Choose a random action with probability epsilon
        if np.random.rand() < self.epsilon:
            action = np.random.randint(board_rows * board_cols)
        else:
            # Choose the action with the highest Q-value
            q_values = self.q_table[state_bytes]
            max_q_value = np.max(q_values)
            actions = np.where(q_values == max_q_value)[0]
            action = np.random.choice(actions)

        return action


    def update_q_value(self, state, action, next_state, reward):
        # Get the current Q-value
        q_value = self.get_q_value(state, action)

        # Get the maximum Q-value for the next state
        if next_state not in self.q_table:
            self.q_table[next_state] = np.zeros(board_rows * board_cols)
        max_q_value = np.max(self.q_table[next_state])

        # Update the Q-value using the Bellman equation
        new_q_value = q_value + self.alpha * (reward + self.gamma * max_q_value - q_value)

        # Update the Q-table
        self.q_table[state][action] = new_q_value

class TicTacToeGame:
    def __init__(self, player_1, player_2):
        self.board = np.zeros((board_rows, board_cols))
        self.player_1 = player_1
        self.player_2 = player_2
        self.current_player = self.player_1
        self.winner = None

    def get_state(self):
        # Convert the board into a string
        return self.board.tobytes()

    def get_valid_moves(self):
        # Get the indices of all empty squares on the board
        return np.where(self.board.flatten() == 0)[0]

    def is_valid_move(self, move):
        # Check if the move is valid
        return move in self.get_valid_moves()

    def make_move(self, move):
        # Make a move on the board
        row = move // board_cols
        col = move % board_cols
        if self.board[row][col] == 0:
            self.board[row][col] = self.current_player
        else:
            while True:
                print("Invalid move. Please choose a valid move.")
                move = self.current_player.choose_action(self.get_state())
                row = move // board_cols
                col = move % board_cols
                if self.board[row][col] == 0:
                    self.board[row][col] = self.current_player
                    break
        return

    def switch_player(self):
        # Switch to the next player
        if self.current_player == self.player_1:
            self.current_player = self.player_2
        else:
            self.current_player = self.player_1

    def check_winner(self):
        # Check if any player has won
        for combination in winning_combinations:
            if (self.board.flatten()[combination] == self.current_player).all():
                self.winner = self.current_player
                return True

        # Check if the game is a draw
        if len(self.get_valid_moves()) == 0:
            self.winner = 0
            return True

        # If there is no winner or draw, return False
        return False

    def reset(self):
        # Reset the board and current player
        self.board = np.zeros((board_rows, board_cols))
        self.current_player = self.player_1
        self.winner = None

    def play_game(self, agent_1, train=False):
        # Reset the game
        self.reset()

        # Loop until there is a winner or a draw
        while not self.check_winner():
            # Get the current state
            state = self.get_state()

            # Choose an action for the current player
            if self.current_player == player_1:
                # Get the human player's move
                while True:
                    try:
                        move = int(input("Enter your move (0-8): "))
                        if self.is_valid_move(move):
                            break
                        else:
                            print("Invalid move, please try again")
                    except ValueError:
                        print("Invalid input, please try again")
                action = move
            else:
                # Let the AI agent choose its move
                action = agent_1.choose_action(state)
                while not self.is_valid_move(action):
                    print("Invalid move. Please choose a valid move.")
                    action = agent_1.choose_action(state)

            # Make the move on the board
            self.make_move(action)

            # Check if there is a winner or a draw
            self.check_winner()

            # Print the current board
            print(self.board)

            # Calculate the reward for the agents
            if self.winner == player_1:
                reward_1 = 1
                reward_2 = -1
            elif self.winner == player_2:
                reward_1 = -1
                reward_2 = 1
            else:
                reward_1 = 0
                reward_2 = 0

            # Update the Q-values for the agents
            if train and self.winner is not None:
                next_state = self.get_state()

                agent_1.update_q_value(state, action, next_state, reward_1)

            # Switch to the next player
            self.switch_player()

        # Print the final board
        print(self.board)

        # Print the winner
        if self.winner == player_1:
            print("You win!")
        elif self.winner == player_2:
            print("AI wins!")
        else:
            print("Draw!")


if __name__ == "__main__":
    # Create the agents
    agent_1 = QLearningAgent(alpha=0.5, gamma=0.9, epsilon=0.0, learning_rate=0.1, discount_factor=0.9,
                             player_marker=player_2)

    # Train the agents
    num_episodes = 100000
    for i in range(num_episodes):
        game = TicTacToeGame(player_1, player_2)
        winner = game.play_game(agent_1)
        agent_1.epsilon *= 0.999
        # agent_2.epsilon *= 0.999

        if i % 10000 == 0:
            print(f"Episode {i}: winner = {winner}, epsilon = {agent_1.epsilon:.3f}")

    # Test the agents
    num_games = 100
    num_wins = 0
    for i in range(num_games):
        game = TicTacToeGame(player_1, player_2)
        winner = game.play_game(agent_1)
        agent_1.epsilon *= 0.999
        # agent_2.epsilon *= 0.999

        if winner == player_1:
            num_wins += 1

    print(f"Agent 1 win rate: {num_wins / num_games * 100:.2f}%")

Enter your move (0-8): 0
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[1. 2. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Enter your move (0-8): 1
Invalid move, please try again
Enter your move (0-8): 3
[[1. 2. 0.]
 [1. 0. 0.]
 [0. 0. 0.]]
[[1. 2. 0.]
 [1. 0. 0.]
 [0. 0. 2.]]
Enter your move (0-8): 6
[[1. 2. 0.]
 [1. 0. 0.]
 [1. 0. 2.]]
Invalid move. Please choose a valid move.
Invalid move. Please choose a valid move.
[[1. 2. 0.]
 [1. 0. 0.]
 [1. 2. 2.]]
[[1. 2. 0.]
 [1. 0. 0.]
 [1. 2. 2.]]
You win!
Episode 0: winner = None, epsilon = 0.000
Enter your move (0-8): 1
[[0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[0. 1. 0.]
 [0. 0. 0.]
 [2. 0. 0.]]
Enter your move (0-8): 8
[[0. 1. 0.]
 [0. 0. 0.]
 [2. 0. 1.]]
[[0. 1. 0.]
 [0. 0. 0.]
 [2. 2. 1.]]
Enter your move (0-8): 2
[[0. 1. 1.]
 [0. 0. 0.]
 [2. 2. 1.]]
Invalid move. Please choose a valid move.
Invalid move. Please choose a valid move.
Invalid move. Please choose a valid move.
[[0. 1. 1.]
 [0. 0. 2.]
 [2. 2. 1.]]
Enter your move (0-8): 0
[[1. 1. 1.]
 [0. 0. 2.]
 [2

KeyboardInterrupt: Interrupted by user

In [None]:
# Now we will work on GUI code

# Simple GUI tic-tac-toe game without AI agent in it. 

In [20]:
class TicTacToe:
    def __init__(self):
        self.root = tk.Tk()
        self.board = [[None for _ in range(3)] for _ in range(3)]
        self.player_turn = 1  # Keep track of whose turn it is. 1 for player 1 and 2 for player 2.
        self.game_over = False  # Indicates whether the game has ended
        self.initialize_gui()

    def initialize_gui(self):
        for i in range(3):
            for j in range(3):
                button = tk.Button(self.root, text="", font=("Arial", 50), height=3, width=6,
                                   command=lambda i=i, j=j: self.button_click(i, j))
                button.grid(row=i, column=j)
                self.board[i][j] = button

    def button_click(self, i, j):
        if not self.game_over and self.board[i][j]['text'] == "":  # Only allow moves on empty squares
            marker = 'X' if self.player_turn == 1 else 'O'
            self.board[i][j].config(text=marker, state='disabled')
            if self.check_winner(i, j, marker):
                print(f"Player {self.player_turn} wins!")
                self.game_over = True
            else:
                self.player_turn = 1 if self.player_turn == 2 else 2

    def check_winner(self, i, j, marker):
        return (all(self.board[i][col]['text'] == marker for col in range(3)) or  # Check row
                all(self.board[row][j]['text'] == marker for row in range(3)) or  # Check column
                (i == j and all(self.board[index][index]['text'] == marker for index in range(3))) or  # Check main diagonal
                (i + j == 2 and all(self.board[index][2-index]['text'] == marker for index in range(3))))  # Check other diagonal

    def run(self):
        self.root.mainloop()

        
if __name__ == "__main__":
    main()

# Train the model first without GUI and then use GUI during testing. 

In [2]:
import tkinter as tk
from tkinter import messagebox
import numpy as np
import pickle

class TicTacToe:
    def __init__(self):
        self.board = [[' ' for _ in range(3)] for _ in range(3)]
        self.player_markers = ['X', 'O']

    def empty_cells(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i][j] == ' ']

    def check_winner(self):
        # Check rows
        for row in self.board:
            if len(set(row)) == 1 and row[0] != ' ':
                return row[0]
        # Check columns
        for col in range(3):
            column = [row[col] for row in self.board]
            if len(set(column)) == 1 and column[0] != ' ':
                return column[0]
        # Check diagonals
        if len(set(self.board[i][i] for i in range(3))) == 1 and self.board[0][0] != ' ':
            return self.board[0][0]
        if len(set(self.board[i][2-i] for i in range(3))) == 1 and self.board[0][2] != ' ':
            return self.board[0][2]
        # No winner
        if ' ' in (cell for row in self.board for cell in row):  # if there's still empty cells, game continues
            return None
        else:
            return 'Draw'  # If no empty cells and no winner, it's a draw


    def random_move(self):
        return self.empty_cells()[np.random.randint(len(self.empty_cells()))]

class QLearningPlayer:
    def __init__(self, alpha=0.5, gamma=0.9, epsilon=0.1, learning_rate=0.02, discount_factor=0.4):
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = dict()  # Q-table
        self.state = None
        self.action = None
        self.marker = 'X'  # Player's marker

    def get_q_value(self, state, action):
        return self.q_table.get((self.state_to_str(state), action), 0)

    def choose_action(self, state):
        empty_cells = [(i, j) for i in range(3) for j in range(3) if state[i][j] == ' ']
        if np.random.rand() < self.epsilon:
            # Exploration: choose random action
            action_idx = np.random.randint(len(empty_cells))
        else:
            # Exploitation: choose action with max Q-value
            q_values = [self.get_q_value(state, action) for action in empty_cells]
            action_idx = np.argmax(q_values)
        self.state = state
        self.action = empty_cells[action_idx]
        return self.action



    def update_q_value(self, reward, next_state):
        current_q_value = self.get_q_value(self.state, self.action)
        empty_cells = [(i, j) for i in range(3) for j in range(3) if next_state[i][j] == ' ']
        if empty_cells:  # Check if there are still empty cells
            max_next_q_value = max([self.get_q_value(next_state, action) for action in empty_cells])
        else:
            max_next_q_value = 0
        new_q_value = current_q_value + self.alpha * (reward + self.gamma * max_next_q_value - current_q_value)
        self.q_table[(self.state_to_str(self.state), self.action)] = new_q_value



    def state_to_str(self, state):
        # Change here: replace the ' ' with a neutral value like 'N'
        return ''.join([''.join(row).replace(' ', 'N') for row in state])


class TicTacToeQLearning(TicTacToe):
    def __init__(self, player):
        super().__init__()
        self.player = player
  
        
    def action_to_position(self, action):
	    return divmod(action, 3)
    
    def position_to_action(self, position):
    	return position[0] * 3 + position[1]


    def play(self):
        while True:
            action = self.player.choose_action(self.board)
            i, j = action
            self.board[i][j] = self.player.marker
            reward = -1 if self.check_winner() == self.player_markers[1 - self.player_markers.index(self.player.marker)] else 0
            self.player.update_q_value(reward, self.board)
            if self.check_winner() == self.player.marker:
                self.player.update_q_value(1, self.board)
                break
            elif len(self.empty_cells()) == 0:
                break
            else:
                i, j = self.random_move()
                self.board[i][j] = self.player_markers[1 - self.player_markers.index(self.player.marker)]
                reward = 1

    
    def human_play(self):
        while True:
            print(self.board)
            if self.player.marker == 'X':
                action = int(input("Choose your action (0-8): "))
                i, j = self.action_to_position(action)
                self.board[i][j] = self.player.marker
                if self.check_winner() == self.player.marker:
                    print("You win!")
                    break
                elif len(self.empty_cells()) == 0:
                    print("It's a draw!")
                    break
            if self.empty_cells():  # Only choose an action if there are empty cells
                action = self.player.choose_action(self.board)
                if isinstance(action, tuple):  # Check if the action is a tuple before trying to convert it
                    action = self.position_to_action(action)  # convert AI player's action choice to integer
                i, j = self.action_to_position(action)
                self.board[i][j] = self.player_markers[1 - self.player_markers.index(self.player.marker)]
                if self.check_winner() == self.player_markers[1 - self.player_markers.index(self.player.marker)]:
                    print("QLearningPlayer wins!")
                    break
            elif len(self.empty_cells()) == 0:
                print("It's a draw!")
                break


class TicTacToeGUI:
    def __init__(self, master, player):
        self.master = master
        self.player = player
        self.board = TicTacToe()
        self.buttons = [[None, None, None] for _ in range(3)]
        for i in range(3):
            for j in range(3):
                self.buttons[i][j] = tk.Button(master, command=lambda row=i, col=j: self.make_move(row, col), height=3, width=6)
                self.buttons[i][j].grid(row=i, column=j)

    def make_move(self, row, col):
        if self.board.board[row][col] == ' ':
            self.board.board[row][col] = self.player.marker
            self.buttons[row][col].config(text=self.player.marker)
            winner = self.board.check_winner()
            if winner is not None:
                self.game_over(winner)
            else:
                self.ai_move()

    def ai_move(self):
        action = self.player.choose_action(self.board.board)
        row, col = action
        self.board.board[row][col] = self.board.player_markers[1 - self.board.player_markers.index(self.player.marker)]
        self.buttons[row][col].config(text=self.board.player_markers[1 - self.board.player_markers.index(self.player.marker)])
        winner = self.board.check_winner()
        if winner is not None:
            self.game_over(winner)

    def game_over(self, winner):
        if winner == 'Draw':
            messagebox.showinfo("Game Over", "The game is a draw.")
        else:
            messagebox.showinfo("Game Over", f"The winner is {winner}.")
        self.master.destroy()


def main():
    player = QLearningPlayer(alpha=0.5, gamma=0.6, epsilon=0.6)

    # Train the QLearningPlayer
    num_episodes = 1000000
    for _ in range(num_episodes):
        game = TicTacToeQLearning(player)
        game.play()
        player.epsilon *= 0.999

    # Save the trained player to a file
    with open('trained_player.pkl', 'wb') as f:
        pickle.dump(player, f)

if __name__ == "__main__":
    main()


In [None]:
####################################################################################

In [None]:
# Testing is below... Above code is for training which worked pretty well. 

### Now I will load the pickle file and test the model in GUI.

In [17]:
import tkinter as tk
from tkinter import messagebox
import numpy as np
import pickle

class TicTacToe:
    def __init__(self):
        self.board = [[' ' for _ in range(3)] for _ in range(3)]
        self.player_markers = ['X', 'O']

    def empty_cells(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i][j] == ' ']

    def check_winner(self):
        # Check rows
        for row in self.board:
            if len(set(row)) == 1 and row[0] != ' ':
                return row[0]
        # Check columns
        for col in range(3):
            column = [row[col] for row in self.board]
            if len(set(column)) == 1 and column[0] != ' ':
                return column[0]
        # Check diagonals
        if len(set(self.board[i][i] for i in range(3))) == 1 and self.board[0][0] != ' ':
            return self.board[0][0]
        if len(set(self.board[i][2-i] for i in range(3))) == 1 and self.board[0][2] != ' ':
            return self.board[0][2]
        # No winner
        if ' ' in (cell for row in self.board for cell in row):  # if there's still empty cells, game continues
            return None
        else:
            return 'Draw'  # If no empty cells and no winner, it's a draw


    def random_move(self):
        return self.empty_cells()[np.random.randint(len(self.empty_cells()))]

class QLearningPlayer:
    def __init__(self, alpha=0.5, gamma=0.9, epsilon=0.1, learning_rate=0.02, discount_factor=0.4):
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = dict()  # Q-table
        self.state = None
        self.action = None
        self.marker = 'X'  # Player's marker

    def get_q_value(self, state, action):
        return self.q_table.get((self.state_to_str(state), action), 0)

    def choose_action(self, state):
        empty_cells = [(i, j) for i in range(3) for j in range(3) if state[i][j] == ' ']
        if np.random.rand() < self.epsilon:
            # Exploration: choose random action
            action_idx = np.random.randint(len(empty_cells))
        else:
            # Exploitation: choose action with max Q-value
            q_values = [self.get_q_value(state, action) for action in empty_cells]
            action_idx = np.argmax(q_values)
        self.state = state
        self.action = empty_cells[action_idx]
        return self.action



    def update_q_value(self, reward, next_state):
        current_q_value = self.get_q_value(self.state, self.action)
        empty_cells = [(i, j) for i in range(3) for j in range(3) if next_state[i][j] == ' ']
        if empty_cells:  # Check if there are still empty cells
            max_next_q_value = max([self.get_q_value(next_state, action) for action in empty_cells])
        else:
            max_next_q_value = 0
        new_q_value = current_q_value + self.alpha * (reward + self.gamma * max_next_q_value - current_q_value)
        self.q_table[(self.state_to_str(self.state), self.action)] = new_q_value



    def state_to_str(self, state):
        # Change here: replace the ' ' with a neutral value like 'N'
        return ''.join([''.join(row).replace(' ', 'N') for row in state])


class TicTacToeQLearning(TicTacToe):
    def __init__(self, player):
        super().__init__()
        self.player = player
  
        
    def action_to_position(self, action):
	    return divmod(action, 3)
    
    def position_to_action(self, position):
    	return position[0] * 3 + position[1]


    def play(self):
        while True:
            action = self.player.choose_action(self.board)
            i, j = action
            self.board[i][j] = self.player.marker
            reward = -1 if self.check_winner() == self.player_markers[1 - self.player_markers.index(self.player.marker)] else 0
            self.player.update_q_value(reward, self.board)
            if self.check_winner() == self.player.marker:
                self.player.update_q_value(1, self.board)
                break
            elif len(self.empty_cells()) == 0:
                break
            else:
                i, j = self.random_move()
                self.board[i][j] = self.player_markers[1 - self.player_markers.index(self.player.marker)]
                reward = 1

    
    def human_play(self):
        while True:
            print(self.board)
            if self.player.marker == 'X':
                action = int(input("Choose your action (0-8): "))
                i, j = self.action_to_position(action)
                self.board[i][j] = self.player.marker
                if self.check_winner() == self.player.marker:
                    print("You win!")
                    break
                elif len(self.empty_cells()) == 0:
                    print("It's a draw!")
                    break
            if self.empty_cells():  # Only choose an action if there are empty cells
                action = self.player.choose_action(self.board)
                if isinstance(action, tuple):  # Check if the action is a tuple before trying to convert it
                    action = self.position_to_action(action)  # convert AI player's action choice to integer
                i, j = self.action_to_position(action)
                self.board[i][j] = self.player_markers[1 - self.player_markers.index(self.player.marker)]
                if self.check_winner() == self.player_markers[1 - self.player_markers.index(self.player.marker)]:
                    print("QLearningPlayer wins!")
                    break
            elif len(self.empty_cells()) == 0:
                print("It's a draw!")
                break


class TicTacToeGUI:
    def __init__(self, master, player):
        self.master = master
        self.player = player
        self.board = TicTacToe()
        self.buttons = [[None, None, None] for _ in range(3)]
        for i in range(3):
            for j in range(3):
                self.buttons[i][j] = tk.Button(master, command=lambda row=i, col=j: self.make_move(row, col), height=3, width=6)
                self.buttons[i][j].grid(row=i, column=j)

    def make_move(self, row, col):
        if self.board.board[row][col] == ' ':
            self.board.board[row][col] = self.player.marker
            self.buttons[row][col].config(text=self.player.marker)
            winner = self.board.check_winner()
            if winner is not None:
                self.game_over(winner)
            else:
                self.ai_move()

    def ai_move(self):
        action = self.player.choose_action(self.board.board)
        row, col = action
        self.board.board[row][col] = self.board.player_markers[1 - self.board.player_markers.index(self.player.marker)]
        self.buttons[row][col].config(text=self.board.player_markers[1 - self.board.player_markers.index(self.player.marker)])
        winner = self.board.check_winner()
        if winner is not None:
            self.game_over(winner)

    def game_over(self, winner):
        if winner == 'Draw':
            messagebox.showinfo("Game Over", "The game is a draw.")
        else:
            messagebox.showinfo("Game Over", f"The winner is {winner}.")
        self.master.destroy()


def main():
    # player = QLearningPlayer(alpha=0.5, gamma=0.6, epsilon=0.6)
	
    # Load the trained player from file. 
    with open('./trained_player.pkl', 'rb') as f:
        player = pickle.load(f)

    # Switch to exploitation mode
    player.epsilon = 0  

    # Play against human
    root = tk.Tk()
    gui = TicTacToeGUI(root, player)
    root.mainloop()


if __name__ == "__main__":
    main()


In [None]:
### The above code works well

# Ok now I am gonna change the reward function

In [11]:
import tkinter as tk
from tkinter import messagebox
import numpy as np
import pickle

class TicTacToe:
    def __init__(self):
        self.board = [[' ' for _ in range(3)] for _ in range(3)]
        self.player_markers = ['X', 'O']

    def empty_cells(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i][j] == ' ']

    def check_winner(self):
        # Check rows
        for row in self.board:
            if len(set(row)) == 1 and row[0] != ' ':
                return row[0]
        # Check columns
        for col in range(3):
            column = [row[col] for row in self.board]
            if len(set(column)) == 1 and column[0] != ' ':
                return column[0]
        # Check diagonals
        if len(set(self.board[i][i] for i in range(3))) == 1 and self.board[0][0] != ' ':
            return self.board[0][0]
        if len(set(self.board[i][2-i] for i in range(3))) == 1 and self.board[0][2] != ' ':
            return self.board[0][2]
        # No winner
        if ' ' in (cell for row in self.board for cell in row):  # if there's still empty cells, game continues
            return None
        else:
            return 'Draw'  # If no empty cells and no winner, it's a draw


    def random_move(self):
        return self.empty_cells()[np.random.randint(len(self.empty_cells()))]

class QLearningPlayer:
    def __init__(self, alpha=0.5, gamma=0.9, epsilon=0.1, learning_rate=0.02, discount_factor=0.4):
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = dict()  # Q-table
        self.state = None
        self.action = None
        self.marker = 'X'  # Player's marker

    def get_q_value(self, state, action):
        return self.q_table.get((self.state_to_str(state), action), 0)

    def choose_action(self, state):
        empty_cells = [(i, j) for i in range(3) for j in range(3) if state[i][j] == ' ']
        if np.random.rand() < self.epsilon:
            # Exploration: choose random action
            action_idx = np.random.randint(len(empty_cells))
        else:
            # Exploitation: choose action with max Q-value
            q_values = [self.get_q_value(state, action) for action in empty_cells]
            action_idx = np.argmax(q_values)
        self.state = state
        self.action = empty_cells[action_idx]
        return self.action

    def update_q_value(self, reward, next_state):
        current_q_value = self.get_q_value(self.state, self.action)
        empty_cells = [(i, j) for i in range(3) for j in range(3) if next_state[i][j] == ' ']
        if empty_cells:  
            max_next_q_value = max([self.get_q_value(next_state, action) for action in empty_cells])
        else:
            max_next_q_value = 0
        new_q_value = current_q_value + self.alpha * (reward + self.gamma * max_next_q_value - current_q_value)
        self.q_table[(self.state_to_str(self.state), self.action)] = new_q_value

    def state_to_str(self, state):
        # Change here: replace the ' ' with a neutral value like 'N'
        return ''.join([''.join(row).replace(' ', 'N') for row in state])


class TicTacToeQLearning(TicTacToe):
    def __init__(self, player):
        super().__init__()
        self.player = player
  
        
    def action_to_position(self, action):
	    return divmod(action, 3)
    
    def position_to_action(self, position):
    	return position[0] * 3 + position[1]

    def play(self):
        while True:
            action = self.player.choose_action(self.board)
            i, j = action
            self.board[i][j] = self.player.marker
            if self.check_winner() == self.player_markers[1 - self.player_markers.index(self.player.marker)]:
                reward = -100
            elif self.check_winner() == self.player.marker:
                reward = 100
            elif len(self.empty_cells()) == 0:
                reward = 0
            else:
                reward = 10
            self.player.update_q_value(reward, self.board)
            if self.check_winner() == self.player.marker:
                reward = 100  # give the player a reward for winning
                self.player.update_q_value(reward, self.board)
                break
            elif len(self.empty_cells()) == 0:
                break
            else:
                i, j = self.random_move()
                self.board[i][j] = self.player_markers[1 - self.player_markers.index(self.player.marker)]
    
    def human_play(self):
        while True:
            print(self.board)
            if self.player.marker == 'X':
                action = int(input("Choose your action (0-8): "))
                i, j = self.action_to_position(action)
                self.board[i][j] = self.player.marker
                if self.check_winner() == self.player.marker:
                    print("You win!")
                    break
                elif len(self.empty_cells()) == 0:
                    print("It's a draw!")
                    break
            if self.empty_cells():  # Only choose an action if there are empty cells
                action = self.player.choose_action(self.board)
                if isinstance(action, tuple):  # Check if the action is a tuple before trying to convert it
                    action = self.position_to_action(action)  # convert AI player's action choice to integer
                i, j = self.action_to_position(action)
                self.board[i][j] = self.player_markers[1 - self.player_markers.index(self.player.marker)]
                if self.check_winner() == self.player_markers[1 - self.player_markers.index(self.player.marker)]:
                    print("QLearningPlayer wins!")
                    break
            elif len(self.empty_cells()) == 0:
                print("It's a draw!")
                break

class TicTacToeGUI:
    def __init__(self, master, player):
        self.master = master
        self.player = player
        self.board = TicTacToe()
        self.buttons = [[None, None, None] for _ in range(3)]
        for i in range(3):
            for j in range(3):
                self.buttons[i][j] = tk.Button(master, command=lambda row=i, col=j: self.make_move(row, col), height=3, width=6)
                self.buttons[i][j].grid(row=i, column=j)

    def make_move(self, row, col):
        if self.board.board[row][col] == ' ':
            self.board.board[row][col] = self.player.marker
            self.buttons[row][col].config(text=self.player.marker)
            winner = self.board.check_winner()
            if winner is not None:
                self.game_over(winner)
            else:
                self.ai_move()

    def ai_move(self):
        action = self.player.choose_action(self.board.board)
        row, col = action
        self.board.board[row][col] = self.board.player_markers[1 - self.board.player_markers.index(self.player.marker)]
        self.buttons[row][col].config(text=self.board.player_markers[1 - self.board.player_markers.index(self.player.marker)])
        winner = self.board.check_winner()
        if winner is not None:
            self.game_over(winner)

    def game_over(self, winner):
        if winner == 'Draw':
            messagebox.showinfo("Game Over", "The game is a draw.")
        else:
            messagebox.showinfo("Game Over", f"The winner is {winner}.")
        self.master.destroy()


def main():
    player = QLearningPlayer(alpha=0.5, gamma=0.6, epsilon=0.8)

    # Train the QLearningPlayer
    num_episodes = 1000000
    decay_rate = 0.9999  # Make the decay slower
    for _ in range(num_episodes):
        game = TicTacToeQLearning(player)
        game.play()
        player.epsilon *= decay_rate

    # Save the trained player to a file
    with open('trained_player.pkl', 'wb') as f:
        pickle.dump(player, f)

if __name__ == "__main__":
    main()
