# Part 1: Developing an agent to play Connect-4 using Tabular Q-Learning

### Note that this is using only default random rollouts (pre-objective 1).

In [None]:
!pip install tensorflow

In [1]:
# Reads a game configuration (algorithm, player colour, board) from a text file
class File_Reader:
    """Parse a Connect-4 game configuration file.

    Expected layout of the file:
        line 1     algorithm type (e.g. UR, UCT, Q-Learning, Deep Q-Learning)
        line 2     the player's colour (e.g. Red, Yellow)
        lines 3-8  the six board rows, one character per cell
    """

    HEADER_LINES = 2  # algorithm type + player colour
    BOARD_ROWS = 6    # number of board rows that follow the header

    def __init__(self, file_path):
        """Remember the path of the file to read; nothing is opened yet."""
        self.file_path = file_path

    def read_file(self):
        """Read and parse the configuration file.

        Returns:
            A tuple ``(algorithm_type, player_color, board)`` where ``board``
            is a list of six rows, each a list of single-character strings,
            or ``None`` when the file is missing, unreadable, or too short
            (a message describing the problem is printed in that case).
        """
        try:
            with open(self.file_path, 'r') as file:
                lines = file.readlines()
        except FileNotFoundError:
            print(f"File {self.file_path} not found.")
            return None
        except (OSError, UnicodeDecodeError) as e:
            # Permission problems, directories, undecodable bytes, ...
            print(f"An error occurred: {e}")
            return None

        # A short file used to produce a silently truncated board; reject it
        # here so the problem is reported where it originates.
        required = self.HEADER_LINES + self.BOARD_ROWS
        if len(lines) < required:
            print(
                f"An error occurred: expected at least {required} lines "
                f"in {self.file_path}, found {len(lines)}."
            )
            return None

        algorithm_type = lines[0].strip()
        player_color = lines[1].strip()

        # Each board line becomes a list of its characters
        board_lines = lines[self.HEADER_LINES:required]
        board = [list(row.strip()) for row in board_lines]

        return algorithm_type, player_color, board


In [16]:
# Board geometry: number of rows and columns of the playing grid
ROWS, COLUMNS = 6, 7

# Text symbol -> integer code used internally
# (0 = empty cell, 1 = red, 2 = yellow); both the one-letter and the
# full colour names are accepted
SYMBOLS = dict(o=0, r=1, y=2, red=1, yellow=2)

# Integer code -> symbol used when displaying the board
INT_TO_SYMBOL = dict(enumerate('ORY'))

# Module-level Q-table for reinforcement learning (starts empty)
Q_table = dict()

# Connect-4 grid together with the rules needed to play on it
class Board:
    """A rows x cols grid of 'O' (empty), 'R' and 'Y' cells.

    Row 0 is the top of the grid; pieces settle at the highest row index
    that is still empty in a column.
    """

    def __init__(self, rows=6, cols=7):
        self.rows = rows
        self.cols = cols
        # Every row is its own list so cells can be edited independently
        self.board = [['O'] * cols for _ in range(rows)]

    def reset(self):
        """Empty every cell of the grid."""
        self.board = [['O'] * self.cols for _ in range(self.rows)]

    def copy(self):
        """Return a new Board whose rows are independent copies of this one's."""
        clone = Board(self.rows, self.cols)
        clone.board = [list(row) for row in self.board]
        return clone

    def PrintBoard(self):
        """Print the grid, cells separated by '|', followed by a dashed rule."""
        for cells in self.board:
            print('|'.join(cells))
        print('-' * (2 * self.cols - 1))

    def AvailableColumns(self):
        """Return the indices of the columns whose top cell is still empty."""
        top_row = self.board[0]
        return [col for col in range(self.cols) if top_row[col] == 'O']

    def AvailableRowInColumn(self, column):
        """Return the lowest empty row in ``column``, or -1 if it is full."""
        bottom_up = range(self.rows - 1, -1, -1)
        return next((r for r in bottom_up if self.board[r][column] == 'O'), -1)

    def CheckWin(self, player):
        """Return True if ``player`` ('R' or 'Y') has four cells in a line."""
        rows, cols = self.rows, self.cols
        # (row step, column step, candidate start rows, candidate start columns)
        windows = (
            (0, 1, range(rows), range(cols - 3)),       # horizontal
            (1, 0, range(rows - 3), range(cols)),       # vertical
            (1, 1, range(rows - 3), range(cols - 3)),   # diagonal, top-left to bottom-right
            (-1, 1, range(3, rows), range(cols - 3)),   # diagonal, bottom-left to top-right
        )
        for d_row, d_col, start_rows, start_cols in windows:
            for r in start_rows:
                for c in start_cols:
                    if all(self.board[r + k * d_row][c + k * d_col] == player
                           for k in range(4)):
                        return True
        return False

    def StateToKey(self):
        """
        Return the grid as a hashable tuple of tuples of integer codes.

        Each cell is lower-cased and looked up in SYMBOLS; symbols that are
        not in the mapping become 0.
        """
        key_rows = []
        for cells in self.board:
            key_rows.append(tuple(SYMBOLS.get(cell.lower(), 0) for cell in cells))
        return tuple(key_rows)

In [6]:
# Import NumPy for numerical operations (used here for random number generation)
import numpy as np  # type: ignore

# Placeholder policy that rates every column at random
class Dummy_RL_Policy:
    """Stand-in for a learned policy; the position itself is never inspected."""

    def predict(self, board):
        """
        Return one random score per column of ``board``.

        The result is a NumPy array of length ``board.cols`` with values
        drawn uniformly from [0, 1), mimicking a policy that rates each move.
        """
        n_columns = board.cols
        scores = np.random.rand(n_columns)
        return scores


In [5]:
# Import necessary modules
import random          # For random move selection
import copy            # For deep copying board states
import math            # For exploration/exploitation calculations

# Node of the Monte Carlo search tree used by UCT (Upper Confidence bounds applied to Trees)
class UCT_Node:
    """One position in the search tree together with its visit statistics.

    Attributes:
        board: private deep copy of the position at this node.
        player: the player whose turn it is in this position ('R' or 'Y').
        parent: the node this one was expanded from (None for the root).
        children: mapping of column index -> child node reached by that move.
        visits: number of times the node has been visited.
        q_value: total reward accumulated at the node.
        untried_moves: legal columns that have no child node yet.
        q_values: optional per-column scores supplied by an RL policy.
    """

    def __init__(self, board, player, parent=None):
        self.board = copy.deepcopy(board)  # isolate this node from later edits to `board`
        self.player = player
        self.parent = parent
        self.children = {}
        self.visits = 0
        self.q_value = 0
        self.untried_moves = board.AvailableColumns()
        self.q_values = None

    def expand(self, rl_policy):
        """
        Add one child for a randomly chosen untried move and return it.

        The move is played for ``self.player`` on a copy of this node's
        board, so the child holds the resulting position with the other
        player to move. The child's ``q_values`` are obtained from
        ``rl_policy.predict`` on that new position.
        """
        move = random.choice(self.untried_moves)
        new_board = copy.deepcopy(self.board)

        # Drop the piece into the chosen column. Previously the landing row
        # was looked up but never used, so children repeated the parent board.
        row = new_board.AvailableRowInColumn(move)
        if row != -1:
            new_board.board[row][move] = self.player

        next_player = 'Y' if self.player == 'R' else 'R'
        child_node = UCT_Node(new_board, next_player, parent=self)
        child_node.q_values = rl_policy.predict(new_board)
        self.children[move] = child_node
        self.untried_moves.remove(move)
        return child_node

    def best_child(self, c_param=1.4):
        """
        Return the child with the highest UCT score.

        score = average reward + c_param * sqrt(ln(parent visits + 1) / child visits)

        A tiny constant keeps the divisions defined for unvisited children.
        Equal scores are resolved in favour of the larger column index.
        """
        def uct_score(child):
            exploit = child.q_value / (child.visits + 1e-8)
            explore = math.sqrt(math.log(self.visits + 1) / (child.visits + 1e-8))
            return exploit + c_param * explore

        _, best = max(
            self.children.items(),
            key=lambda item: (uct_score(item[1]), item[0]),
        )
        return best

    def is_fully_expanded(self):
        """Return True once every legal move from this node has a child."""
        return len(self.untried_moves) == 0

    def is_terminal(self):
        """Return True if either player has won or no column can take a piece."""
        return self.board.CheckWin('R') or self.board.CheckWin('Y') or not self.board.AvailableColumns()

In [7]:
# from uct_node import UCT_Node
import random
import numpy as np  # type: ignore
import copy

# Monte Carlo Tree Search (UCT) driver built on top of UCT_Node
class uct_tree:
    """Run UCT simulations to choose a column for ``player``.

    Statistics stored on a node are kept from the point of view of the
    player who made the move leading into that node, which is what
    ``UCT_Node.best_child`` maximises when the parent picks a move.
    """

    def __init__(self, player, rl_policy, board, num_simulations=1000):
        self.player = player                    # player the search chooses moves for ('R' or 'Y')
        self.rl_policy = rl_policy              # policy object exposing predict(board)
        self.num_simulations = num_simulations  # simulations per call to search()
        self.exploration_weight = 1.0           # stored but not read by the methods below

        self.board = board                      # board supplied at construction time

        # Root node for the construction-time board; search() builds a fresh
        # root for the board it is given.
        self.root = UCT_Node(self.board, self.player)
        self.root.q_values = rl_policy.predict(self.board)

    def search(self, board):
        """
        Run ``num_simulations`` simulations from ``board`` and return the
        column of the most visited root child.

        Returns None (after printing a message) when the board is full, and
        a random legal column if no child could be created.
        """
        root = UCT_Node(board, self.player)

        if not board.AvailableColumns():
            print("Game over: Board is full.")
            return None

        for _ in range(self.num_simulations):
            # select_node already adds the new child, so no second expansion
            # step is needed here.
            leaf = self.select_node(root)

            # Play out from the leaf with the player who is to move there.
            reward = self.rollout(leaf.board, leaf.player)

            # rollout() scores from self.player's point of view. The leaf's
            # statistics belong to the player who moved into it, i.e. the
            # opponent of leaf.player, so flip the sign when that is not
            # self.player.
            if leaf.player == self.player:
                reward = -reward
            self.backpropagate(leaf, reward)

        if not root.children:
            print("Warning: No children in root node, returning a random move.")
            return random.choice(board.AvailableColumns())

        best_move = max(root.children.items(), key=lambda item: item[1].visits)[0]
        return best_move

    def select_node(self, node):
        """
        Walk down from ``node`` through best children.

        Returns the newly created child of the first node that still has
        untried moves, or the terminal node reached if there is none.
        """
        while not node.is_terminal():
            if not node.is_fully_expanded():
                return node.expand(self.rl_policy)
            node = node.best_child()
        return node

    def expand_node(self, node):
        """
        Add one child to ``node`` and return it, or return None when every
        move from ``node`` already has a child.

        Kept for callers that expand explicitly; search() relies on
        select_node, which expands as part of selection.
        """
        if not node.is_fully_expanded():
            return node.expand(self.rl_policy)
        return None

    def rollout(self, board, player):
        """
        Play a game to the end on a copy of ``board``, ``player`` moving first.

        Each move is the legal column with the highest score returned by
        ``self.rl_policy.predict``. Returns +1 if ``self.player`` wins,
        -1 if they lose, and 0 for a draw.
        """
        rollout_board = copy.deepcopy(board)
        turn = player

        while True:
            if rollout_board.CheckWin('R'):
                return 1 if self.player == 'R' else -1
            if rollout_board.CheckWin('Y'):
                return 1 if self.player == 'Y' else -1

            available_moves = rollout_board.AvailableColumns()
            if not available_moves:
                return 0  # board full without a winner

            # Restrict the choice to legal columns: a plain argmax could pick
            # a full column, which skipped the turn and, with a deterministic
            # policy, never terminated.
            q_values = np.asarray(self.rl_policy.predict(rollout_board)).ravel()
            best_move = max(available_moves, key=lambda col: q_values[col])

            row = rollout_board.AvailableRowInColumn(best_move)
            if row != -1:
                rollout_board.board[row][best_move] = 'R' if turn == 'R' else 'Y'

            turn = 'Y' if turn == 'R' else 'R'

    def backpropagate(self, node, reward):
        """
        Add ``reward`` to ``node`` and walk up to the root, incrementing
        visit counts and negating the reward at each level because the
        player to move alternates.
        """
        while node is not None:
            node.visits += 1
            node.q_value += reward
            reward = -reward
            node = node.parent


In [8]:
# from board import *  # Assuming necessary imports for board functionality
import random

class Uniform_Random:
    """Plays a whole game in which both sides pick legal columns at random."""

    @staticmethod
    def UniformRandom(player, board, output_type):
        """
        Play random moves on ``board`` (modified in place) until the game ends.

        Arguments:
            player: the side that moves first ('R' or 'Y')
            board: Board object holding the position to play from
            output_type: "verbose" also prints the board before every move;
                "none" suppresses the board printed after each move; the
                comparison ignores case. Move and result messages are
                always printed.
        """
        mode = output_type.lower()
        other = {'R': 'Y'}.get(player, 'R')
        mover = player

        while True:
            if mode == "verbose":
                print("Current board:")
                board.PrintBoard()

            open_columns = board.AvailableColumns()
            if not open_columns:
                # Nothing left to play: the game ends without a move
                print("Game over: Board is full.")
                return

            column = random.choice(open_columns)
            landing_row = board.AvailableRowInColumn(column)
            board.board[landing_row][column] = mover

            # Columns are reported 1-based
            print(f"Move selected: {column + 1}\n")

            if mode != "none":
                print("Updated board:")
                board.PrintBoard()

            if board.CheckWin(mover):
                print(f"Game Over! {mover} Player wins.")
                return
            if not board.AvailableColumns():
                print("Game Over! It's a draw.")
                return

            # Hand the move to the other side
            mover = other if mover == player else player

In [11]:
import random
from collections import deque
import numpy as np  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
from tensorflow.keras.models import Sequential  # type: ignore
from tensorflow.keras.layers import Dense, Input  # type: ignore
from tensorflow.keras.optimizers import Adam  # type: ignore
# from board import *  # Assumes Board, ROWS, and COLUMNS are defined in board.py

# Mapping symbols for the players (colour name or letter -> integer code),
# and the reverse mapping from integer code to board symbol.
# NOTE(review): SYMBOLS and INT_TO_SYMBOL are also assigned in the cell that
# defines the board constants; whichever cell runs last decides the binding.
# This version has no 'o' (empty) entry and adds upper-case 'R' / 'Y' keys.
SYMBOLS = {'r': 1, 'y': 2, 'red': 1, 'yellow': 2, 'R': 1, 'Y': 2}
INT_TO_SYMBOL = {0: 'O', 1: 'R', 2: 'Y'}

class DQNAgent:
    """
    DQNAgent: A Deep Q-Network agent for training an AI to play a game like Connect4.

    Holds a small fully connected Keras network, a bounded replay memory and
    the epsilon-greedy exploration schedule.
    """
    def __init__(self, state_size, action_size):
        """
        Initializes the DQN agent.
        Args:
            state_size: The size of the state (number of cells in the board).
            action_size: The number of actions (columns in the game).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries drop out first
        self.gamma = 0.95                 # discount factor for future rewards
        self.epsilon = 1.0                # exploration rate (starts fully random)
        self.epsilon_min = 0.01           # floor for the exploration rate
        self.epsilon_decay = 0.995        # multiplicative decay applied per replay
        self.learning_rate = 0.001        # optimizer learning rate
        self.model = self._build_model()

    def _build_model(self):
        """
        Builds the DQN model (a simple fully connected neural network).
        Returns:
            model: A compiled Keras model with one linear output per action.
        """
        model = Sequential()
        model.add(Input(shape=(self.state_size,)))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """
        Stores one experience in memory for future learning.
        Args:
            state: The current state.
            action: The action taken.
            reward: The reward received for the action.
            next_state: The resulting state after the action.
            done: Whether the game is over (True/False).
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """
        Chooses an action based on the current state using an epsilon-greedy strategy.
        Args:
            state: The current state, in the shape the model expects.
        Returns:
            action: The selected action (column). The choice is not checked
            against the board, so a full column may be returned.
        """
        if np.random.rand() <= self.epsilon:  # explore
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)  # exploit
        return np.argmax(act_values[0])

    def _decay_epsilon(self):
        """Shrink epsilon by one decay step without going below epsilon_min."""
        if self.epsilon > self.epsilon_min:
            # Clamp: a plain multiplication could land below the floor
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def replay(self, batch_size):
        """
        Trains the model on a random batch of remembered experiences, then
        decays epsilon. Does nothing while memory holds fewer than
        ``batch_size`` experiences.
        Args:
            batch_size: The number of experiences to sample for training.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the best predicted value of the next state
                target += self.gamma * np.amax(self.model.predict(next_state, verbose=0)[0])
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target  # only the taken action's value is moved
            self.model.fit(state, target_f, epochs=1, verbose=0)

        self._decay_epsilon()

    def load(self, name):
        """
        Loads model weights from a file.
        Args:
            name: The file name to load weights from.
        """
        self.model.load_weights(name)

    def save(self, name):
        """
        Saves model weights to a file.
        Args:
            name: The file name to save weights to.
        """
        self.model.save_weights(name)


def TrainDQNAgent(player_color, EPISODES, board, make_plot=True):
    """
    Trains a DQN agent against an opponent that plays random legal moves.

    Every episode starts from a copy of ``board``. One step is the agent's
    move followed, if the game is still open, by the opponent's reply.
    Rewards: +10 for a win, -10 for a loss or for choosing a full column
    (which also ends the episode), 0 otherwise.

    Args:
        player_color: The agent's colour ('R', 'Y', 'red' or 'yellow', any case).
        EPISODES: The number of training episodes.
        board: The board object each episode starts from (it is not modified).
        make_plot: Whether to show the reward and epsilon plots afterwards.
    Returns:
        agent: The trained DQN agent.
        rewards: The total reward of each episode.
        epsilon_values: The agent's epsilon at the end of each episode.
    Raises:
        ValueError: If ``player_color`` is not a known colour.
    """
    state_size = ROWS * COLUMNS  # flattened board length
    agent = DQNAgent(state_size, COLUMNS)
    batch_size = 32
    rewards, epsilon_values = [], []

    player_color = player_color.lower()
    if player_color not in SYMBOLS:
        raise ValueError(f"Invalid player color: {player_color}. Valid colors: 'R', 'Y', 'red', 'yellow'.")
    player = SYMBOLS[player_color]  # integer code; the opponent's code is 3 - player

    def play_step(grid, column):
        """Apply the agent's move and the random reply; return (reward, done)."""
        if grid.board[0][column] != 'O':
            return -10, True  # column already full
        landing_row = grid.AvailableRowInColumn(column)
        if landing_row == -1:
            return -10, True
        grid.board[landing_row][column] = INT_TO_SYMBOL[player]
        if grid.CheckWin(INT_TO_SYMBOL[player]):
            return 10, True
        open_columns = grid.AvailableColumns()
        if not open_columns:
            return 0, True  # board full: draw
        reply = random.choice(open_columns)
        reply_row = grid.AvailableRowInColumn(reply)
        grid.board[reply_row][reply] = INT_TO_SYMBOL[3 - player]
        if grid.CheckWin(INT_TO_SYMBOL[3 - player]):
            return -10, True
        return 0, False

    for e in range(EPISODES):
        current_board = board.copy()
        total_reward = 0

        while True:
            state = np.reshape(current_board.StateToKey(), [1, state_size])
            action = agent.act(state)
            reward, done = play_step(current_board, action)

            next_state = np.reshape(current_board.StateToKey(), [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            total_reward += reward

            if done:
                break

            # Learn from stored experiences between moves of an open game
            agent.replay(batch_size)

        rewards.append(total_reward)
        epsilon_values.append(agent.epsilon)
        print(f"Episode: {e + 1}/{EPISODES}, Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")

    if make_plot:
        # Reward per episode
        plt.plot(rewards)
        plt.ylabel('Reward')
        plt.xlabel('Episode')
        plt.title('DQN Agent Training Rewards')
        plt.show()

        # Exploration rate per episode
        plt.plot(epsilon_values)
        plt.ylabel('Epsilon')
        plt.xlabel('Episode')
        plt.title('DQN Agent Epsilon Over Time')
        plt.show()

    return agent, rewards, epsilon_values


# Example: Initialize and print model summary
# (executes only when this code runs as the top-level module)
if __name__ == "__main__":
    dummy_board = Board()  # Create a dummy board object (not used by the lines below)
    agent = DQNAgent(state_size=ROWS * COLUMNS, action_size=COLUMNS)  # Agent sized for one input per cell, one output per column
    agent.model.summary()  # Print model summary


In [12]:
import random
import matplotlib.pyplot as plt # type: ignore
from IPython.display import clear_output  # For live updates in Jupyter environments
# from board import *  # Assuming Board is defined in board.py

class QAgent:
    """Tabular Q-learning agent for Connect-4, trained against a random opponent.

    The Q-table maps a board key (see StateToKey) to a dictionary of
    ``column -> value`` for the moves seen from that position.
    """

    def __init__(self, Q_table=None, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
        """
        Initialize Q-learning agent with optional Q-table and hyperparameters.

        Args:
            Q_table: dictionary to use (and update in place) as the Q-table,
                or None to start with an empty one.
            learning_rate: step size (alpha) of the Q-value update.
            discount_factor: discount (gamma) applied to future value.
            epsilon: probability of choosing a random move.
        Raises:
            TypeError: if Q_table is neither a dictionary nor None.
        """
        if Q_table is not None and not isinstance(Q_table, dict):
            raise TypeError("Q_table must be a dictionary or None")
        # Identity check, not truthiness: an empty dict supplied by the caller
        # must be kept so the caller sees the learned values.
        self.Q_table = Q_table if Q_table is not None else {}
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon

    def StateToKey(self, board):
        """
        Convert the current board state to a string key for Q-table lookup
        (all cell symbols concatenated row by row).
        """
        return ''.join(''.join(row) for row in board.board)

    def QLearningMove(self, player, board):
        """
        Choose the next move using epsilon-greedy policy, based on Q-values.

        Returns:
            (next_board, selected_column): a copy of ``board`` with
            ``player``'s piece placed in the chosen column, and that column.
            ``board`` itself is not modified.
        """
        state_key = self.StateToKey(board)
        available_columns = board.AvailableColumns()

        # First visit to this position: start every legal move at 0
        if state_key not in self.Q_table:
            self.Q_table[state_key] = {col: 0.0 for col in available_columns}

        if random.random() < self.epsilon:
            selected_column = random.choice(available_columns)  # explore
        else:
            known_values = self.Q_table[state_key]
            selected_column = max(available_columns, key=lambda col: known_values.get(col, 0.0))

        row = board.AvailableRowInColumn(selected_column)
        if row != -1:
            next_board = board.copy()
            next_board.board[row][selected_column] = player
        else:
            next_board = board  # column unexpectedly full: hand back the board unchanged

        return next_board, selected_column

    @staticmethod
    def _game_outcome(board, player, opponent):
        """Return (reward, done) for ``player``: win 1, loss -1, draw or open game 0."""
        if board.CheckWin(player):
            return 1, True
        if board.CheckWin(opponent):
            return -1, True
        if not board.AvailableColumns():
            return 0, True
        return 0, False

    def _play_training_episode(self, player, opponent, board, output_type):
        """
        Play one game on ``board`` (modified in place), the agent moving first
        and the opponent replying with random legal moves.

        Returns a list of ``(state, action, next_state, reward, done)``
        tuples, one per agent move; ``next_state`` is the position after the
        opponent's reply (or the final position).
        """
        history = []
        while board.AvailableColumns():
            state_key = self.StateToKey(board)

            # Agent's move
            next_board, action = self.QLearningMove(player, board)
            board.board = next_board.board
            if output_type == "verbose":
                print(f"After {player}'s move (column {action + 1}):")
                board.PrintBoard()
            reward, done = self._game_outcome(board, player, opponent)

            # Opponent's reply, placed on the board
            if not done:
                reply = random.choice(board.AvailableColumns())
                reply_row = board.AvailableRowInColumn(reply)
                board.board[reply_row][reply] = opponent
                if output_type == "verbose":
                    print(f"After {opponent}'s move (column {reply + 1}):")
                    board.PrintBoard()
                reward, done = self._game_outcome(board, player, opponent)

            history.append((state_key, action, self.StateToKey(board), reward, done))
            if done:
                break
        return history

    def _update_q_values(self, history):
        """
        Apply the Q-learning update to every recorded transition, newest
        first so the final reward reaches earlier moves in the same pass.
        """
        for state, action, next_state, reward, done in reversed(history):
            action_values = self.Q_table.setdefault(state, {})
            current = action_values.setdefault(action, 0.0)
            if done:
                max_future = 0.0  # nothing follows a finished game
            else:
                max_future = max(self.Q_table.get(next_state, {}).values(), default=0.0)
            target = reward + self.discount_factor * max_future
            action_values[action] = current + self.learning_rate * (target - current)

    def TrainQLearning(self, player, num_episode, board, num_simulations=1, output_type="verbose"):
        """
        Train the Q-learning agent over multiple episodes.

        Args:
            player: the agent's symbol ('R' or 'Y'); the agent moves first.
            num_episode: accepted for compatibility; see the note below.
            board: board to play on; it is reset before every episode.
            num_simulations: number of training episodes to play.
            output_type: "verbose" prints the board after every move.
        Returns:
            List of ``(episode, agent win rate, 0.5)`` tuples recorded every
            50 episodes, plus the entry appended by PlotFinalResults.
        """
        print("Training QLearning...")
        opponent = 'Y' if player == 'R' else 'R'
        win_rates = []

        # NOTE(review): the episode count has always come from num_simulations
        # (the old loop variable shadowed num_episode); confirm which of the
        # two callers intend before switching.
        for episode in range(num_simulations):
            board.reset()
            history = self._play_training_episode(player, opponent, board, output_type)
            self._update_q_values(history)

            # Periodic progress report
            if episode % 50 == 0:
                q_learning_win_rate = self.EvaluateAgent(player)
                win_rates.append((episode, q_learning_win_rate, 0.5))
                self.PlotLearningCurve(win_rates, live_update=True)
                print(f"Episode {episode}, Q-learning Win rate: {q_learning_win_rate:.2f}")

        print("Training completed.")
        self.PlotFinalResults(win_rates)
        return win_rates

    def EvaluateAgent(self, player, num_games=20):
        """
        Evaluate the agent's performance against a random agent.

        Plays ``num_games`` games on fresh boards with the agent moving first
        and returns the fraction the agent won. Moves come from
        QLearningMove, so they still include epsilon exploration.
        """
        wins = 0
        opponent = 'Y' if player == 'R' else 'R'

        for _ in range(num_games):
            board = Board()
            turn = player

            while board.AvailableColumns():
                if turn == player:
                    board, _ = self.QLearningMove(player, board)
                else:
                    column = random.choice(board.AvailableColumns())
                    row = board.AvailableRowInColumn(column)
                    board.board[row][column] = opponent

                if board.CheckWin(player):
                    wins += 1
                    break
                if board.CheckWin(opponent):
                    break

                turn = opponent if turn == player else player

        return wins / num_games

    def PlotLearningCurve(self, win_rates, live_update=False):
        """
        Plot the learning curve of the agent's performance over time.

        Args:
            win_rates: sequence of ``(episode, agent rate, reference rate)``.
            live_update: clear the previous cell output before showing the
                figure, so the plot appears to refresh in place.
        """
        episodes = [entry[0] for entry in win_rates]
        q_learning_win_rates = [entry[1] for entry in win_rates]
        random_win_rates = [entry[2] for entry in win_rates]

        plt.figure(figsize=(8, 5))
        plt.plot(episodes, q_learning_win_rates, marker='o', color='red', label='Q-learning Agent')
        plt.plot(episodes, random_win_rates, color='blue', label='Random Agent')
        plt.title('Win Rates over Training')
        plt.xlabel('Episode')
        plt.ylabel('Win Rate')
        plt.legend()
        plt.grid(True)

        if live_update:
            clear_output(wait=True)
        plt.show()

    def PlotFinalResults(self, win_rates):
        """
        Evaluate once more, append the result to ``win_rates`` and plot it.
        """
        # NOTE(review): the colours are fixed here regardless of the colour
        # used in training, and the second figure is this agent's win rate
        # playing as 'Y', not a random agent's win rate. The appended entry
        # also uses len(win_rates) rather than an episode number as its x value.
        q_learning_win_rate = self.EvaluateAgent('R')
        random_win_rate = self.EvaluateAgent('Y')
        win_rates.append((len(win_rates), q_learning_win_rate, random_win_rate))
        print(f"Final Q-learning Win rate: {q_learning_win_rate:.2f}")
        print(f"Final Random Win rate: {random_win_rate:.2f}")
        self.PlotLearningCurve(win_rates, live_update=False)

In [None]:
import sys
# from file_reader import File_Reader
# from board import Board
# from uct_tree import uct_tree
# from uniform_random import Uniform_Random
# from dummy_rl_policy import Dummy_RL_Policy
# from q_agent import QAgent
#from uct_node import UCT_Node
import numpy as np  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
# from dqn_agent import *

class Main:
    """
    Interactive driver: loads a board from "test.txt" and repeatedly lets the
    user pick an algorithm (UR, UCT, QL, DQN) to make the next move for the
    player colour read from the file, until that player wins, no columns
    remain available, or the user enters "C".
    """
    SYMBOLS = {'B': 1, 'R': 2}  # Mapping for player symbols (B for Black, R for Red)
    num_simulations = 5  # Default number of simulations for certain algorithms
    game_loop = True  # Flag to control whether the game loop continues

    @staticmethod
    def _board_dimensions(board_data):
        """
        Validate the parsed board and return its size as (rows, columns).

        Raises:
            TypeError: board_data is not a list of lists of strings.
            ValueError: board_data has no rows, or its first row is empty.
        """
        is_grid = (
            isinstance(board_data, list)
            and all(isinstance(row, list) for row in board_data)
            and all(isinstance(cell, str) for row in board_data for cell in row)
        )
        if not is_grid:
            raise TypeError("Board data is not in the correct 2D list format!")  # Raise error if data is not in proper format
        # An empty grid passes the type check above but cannot be indexed below
        if not board_data or not board_data[0]:
            raise ValueError("Board data is empty; expected at least one non-empty row.")
        return len(board_data), len(board_data[0])

    def _place_piece(self, column, token):
        """
        Write token into the row that the board reports for column.

        Returns True when the piece was written, False when
        AvailableRowInColumn returned -1 and the board was left unchanged.
        """
        row = self.board.AvailableRowInColumn(column)
        if row == -1:
            return False
        self.board.board[row][column] = token
        return True

    def _report_move(self, label, column, placed):
        """Print the 1-based column chosen by an algorithm and flag a move that was not placed."""
        print(f"{label} selected column: {column + 1}")
        if not placed:
            print(f"No free row reported for column {column + 1}; no piece was placed.")

    def _run_uct(self, player_color):
        """Choose a column with a UCT search and play it for player_color."""
        rl_policy = Dummy_RL_Policy()  # Placeholder policy
        uct = uct_tree(player_color, rl_policy, self.board, num_simulations=self.num_simulations)
        move = uct.search(self.board)
        placed = self._place_piece(move, player_color)
        self._report_move("UCT", move, placed)

    def _run_q_learning(self, player_color):
        """Train a fresh QAgent, then play the column it selects for player_color."""
        q_agent = QAgent()
        q_agent.TrainQLearning(player_color, 1, self.board, num_simulations=self.num_simulations, output_type="verbose")
        _, selected_column = q_agent.QLearningMove(player_color, self.board)
        placed = self._place_piece(selected_column, player_color)
        self._report_move("Q-Learning", selected_column, placed)

    def _run_dqn(self, player_color):
        """Train a DQN agent, then play the action it returns for the current board."""
        print(f"Training DQN Agent for {self.num_simulations} episodes...")  # Print training message
        agent, _rewards, _epsilon_values = TrainDQNAgent(player_color, self.num_simulations, self.board)
        grid = self.board.board
        # Flatten the board into a single row of rows * columns cells
        state = np.reshape(grid, [1, len(grid) * len(grid[0])])
        action = agent.act(state)
        # NOTE(review): this branch writes the integer code from SYMBOLS while
        # the other branches write the colour string, and SYMBOLS only has
        # 'B' and 'R' - confirm this matches what Board.CheckWin expects.
        player_num = self.SYMBOLS[player_color]
        placed = self._place_piece(action, player_num)
        self._report_move("DQN", action, placed)

        plt.show()  # Display the learning curve

    def main(self):
        """
        Load the game configuration and run the interactive move loop.

        Returns early, after printing a message, when the configuration file
        could not be read. Raises TypeError or ValueError when the parsed
        board is malformed or empty.
        """
        print("Main class initialized")

        # Initialize file reader and read file content
        self.file_reader = File_Reader("test.txt")
        parsed = self.file_reader.read_file()
        if parsed is None:
            # read_file prints the cause and returns None on failure, so
            # unpacking it directly would raise an unrelated TypeError.
            print("Could not load the game configuration; exiting.")
            return
        algorithm_type, player_color, board_data = parsed
        print(f"Algorithm Type: {algorithm_type}")
        print(f"Player Color: {player_color}")
        print(f"Board Data: {board_data}")

        # Validate board data format and size before building the board
        rows, columns = self._board_dimensions(board_data)

        # Initialize the board and set it to the board data from the file
        self.board = Board(rows, columns)
        self.board.board = board_data
        self.board.PrintBoard()  # Print the initial board

        # Start the game loop
        while self.game_loop:
            choice = input("Enter algorithm type (UR, UCT, QL, DQN, C to quit): ").strip()

            # Validate the input for the algorithm type
            if choice not in ["UR", "UCT", "QL", "DQN", "C"]:
                print(f"Invalid input: {choice}. Please try again.")
                continue

            if choice == "C":
                print("Exiting the game.")
                break

            print(f"\n--- Running {choice} Algorithm ---")

            if choice == "UR":  # Uniform Random Algorithm
                Uniform_Random.UniformRandom(player_color, self.board, output_type="verbose")
            elif choice == "UCT":  # Upper Confidence Bound for Trees (UCT) Algorithm
                self._run_uct(player_color)
            elif choice == "QL":  # Q-Learning Algorithm
                self._run_q_learning(player_color)
            elif choice == "DQN":  # Deep Q-Network Algorithm
                self._run_dqn(player_color)

            # Print updated board after move
            print("Updated board:")
            self.board.PrintBoard()

            # Check if the current player has won or if the game is a draw
            if self.board.CheckWin(player_color):
                print(f"Game Over! {player_color} Player wins.")
                self.game_loop = False
            elif not self.board.AvailableColumns():  # No available moves left
                print("Game Over! It's a draw.")
                self.game_loop = False

# Entry point: start the interactive game only when this code is executed
# directly (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    runner = Main()  # Create an instance of the Main class
    runner.main()  # Call the main method to start the game loop