In [6]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Board geometry. The win checks in TicTacToe assume a square board
# (BOARD_ROWS == BOARD_COLS), as is the case for the 3x3 game used here.
BOARD_ROWS = 3
BOARD_COLS = 3
BOARD_SIZE = BOARD_ROWS * BOARD_COLS

class TicTacToe:
    """Minimal two-player tic-tac-toe environment.

    The board is a float array of shape (BOARD_ROWS, BOARD_COLS) in which
    0 marks an empty cell, 1 a move by the first player and -1 a move by
    the second player. `player_symbol` holds the symbol that the next call
    to `update_state` will place; it starts at 1.
    """

    def __init__(self):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.player_symbol = 1

    def get_hash(self):
        """Return the flattened board rendered as a string (used as a dict key)."""
        return str(self.board.reshape(BOARD_SIZE))

    def winner(self):
        """Report the state of the game.

        Returns:
            1 or -1 if that symbol fills a complete row, column or diagonal,
            0 if the board is full with no such line (draw),
            None if the game is still in progress.
        """
        # A line is complete when its cells sum to +/- the line length.
        full = BOARD_ROWS
        row_totals = self.board.sum(axis=1)
        col_totals = self.board.sum(axis=0)
        for i in range(full):
            if row_totals[i] == full or col_totals[i] == full:
                return 1
            if row_totals[i] == -full or col_totals[i] == -full:
                return -1

        main_diag = self.board.diagonal().sum()
        anti_diag = np.fliplr(self.board).diagonal().sum()
        if main_diag == full or anti_diag == full:
            return 1
        if main_diag == -full or anti_diag == -full:
            return -1

        # No complete line: a board without empty cells is a draw.
        if not np.any(self.board == 0):
            return 0
        return None

    def available_positions(self):
        """Return the (row, col) tuples of all empty cells, in row-major order."""
        return [
            (i, j)
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
            if self.board[i, j] == 0
        ]

    def update_state(self, position):
        """Place the current player's symbol at `position` and switch players.

        `position` is used directly as an index into the board; no check is
        made that the cell is empty.
        """
        self.board[position] = self.player_symbol
        self.player_symbol = -1 if self.player_symbol == 1 else 1

    def reset(self):
        """Clear the board and give the first move back to symbol 1."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.player_symbol = 1

In [7]:
class Agent:
    """Tabular value-learning player.

    `states_value` maps a board hash (the string form of the flattened
    board) to a learned value; unseen boards count as 0. `states` is the
    list of board hashes recorded for the current game, which `feed_reward`
    walks backwards to propagate the final reward.

    Args:
        exp_rate: probability of picking a uniformly random move instead of
            the greedy one.
        lr: step size of the value update.
        decay_gamma: factor applied to the reward as it is propagated back
            through the recorded states.
    """

    def __init__(self, exp_rate=0.3, lr=0.2, decay_gamma=0.9):
        self.lr = lr
        self.exp_rate = exp_rate
        self.decay_gamma = decay_gamma
        self.states_value = {}
        self.states = []

    def choose_action(self, positions, current_board, symbol):
        """Pick one of `positions` epsilon-greedily.

        Args:
            positions: non-empty sequence of board indices that may be played.
            current_board: numpy array holding the current board; not modified.
            symbol: value written into the board for a candidate move.

        Returns:
            An element of `positions`. In the greedy case this is the move
            whose resulting board has the highest stored value; ties go to
            the last such move in `positions`.

        Raises:
            ValueError: if `positions` is empty.
        """
        if len(positions) == 0:
            raise ValueError("choose_action requires at least one available position")

        if np.random.uniform(0, 1) <= self.exp_rate:
            return positions[np.random.choice(len(positions))]

        value_max = float("-inf")
        action = positions[0]
        for p in positions:
            nb = current_board.copy()
            nb[p] = symbol
            # reshape(-1) flattens a board of any size, so the key has the same
            # form as str(board.reshape(<number of cells>)).
            h = str(nb.reshape(-1))
            value = self.states_value.get(h, 0)
            if value >= value_max:
                value_max, action = value, p
        return action

    def feed_reward(self, reward):
        """Propagate `reward` backwards through the states of the finished game.

        Each recorded state is moved towards `decay_gamma * reward` with step
        size `lr`; the updated value then becomes the reward for the state
        that preceded it.
        """
        for st in reversed(self.states):
            updated = self.states_value.get(st, 0)
            updated += self.lr * (self.decay_gamma * reward - updated)
            self.states_value[st] = updated
            reward = updated

In [8]:
def _p1_reward(winner):
    """Map a finished game's result to player 1's reward: win 1, draw 0.1, loss 0."""
    if winner == 1:
        return 1
    if winner == 0:
        return 0.1
    return 0


def run_experiment(eps, total_runs=100, steps=1000, seed=None):
    """Estimate player 1's per-episode win rate for a given exploration rate.

    For each of `total_runs` independent runs, a fresh player 1
    (`Agent(exp_rate=eps)`) plays `steps` consecutive games as symbol 1
    against a fresh `Agent(exp_rate=0.1)` playing symbol -1. Only player 1
    records states and receives rewards; the opponent's value table is never
    updated, so its greedy moves are always taken over all-zero values.

    Args:
        eps: exploration rate of player 1.
        total_runs: number of independent runs to average over (>= 1).
        steps: number of games (episodes) per run.
        seed: optional seed for NumPy's global random state, applied once
            before the first run so that results can be reproduced.

    Returns:
        Array of length `steps`; entry s is the fraction of runs in which
        player 1 won game s.

    Raises:
        ValueError: if `total_runs` is less than 1 (the average would be 0/0).
    """
    if total_runs < 1:
        raise ValueError("total_runs must be at least 1")
    if seed is not None:
        # The agents draw from NumPy's global random state.
        np.random.seed(seed)

    # Number of runs in which player 1 won game s; divided by total_runs on return.
    aggregated_results = np.zeros(steps)

    print(f"Running {total_runs} independent experiments for epsilon={eps}...")
    for _ in tqdm(range(total_runs)):
        p1 = Agent(exp_rate=eps)
        p2 = Agent(exp_rate=0.1)  # Fixed opponent: never receives rewards
        env = TicTacToe()

        for s in range(steps):
            winner = None
            while winner is None:
                # P1 turn: play, and record the resulting board for learning.
                act1 = p1.choose_action(env.available_positions(), env.board, 1)
                env.update_state(act1)
                p1.states.append(env.get_hash())
                winner = env.winner()

                if winner is None:
                    # P2 turn
                    act2 = p2.choose_action(env.available_positions(), env.board, -1)
                    env.update_state(act2)
                    winner = env.winner()

            # The game is over: reward player 1 exactly once, whoever moved last.
            p1.feed_reward(_p1_reward(winner))

            if winner == 1:
                aggregated_results[s] += 1

            env.reset()
            p1.states = []

    return aggregated_results / total_runs

In [None]:
# Settings: 100 independent runs, 5000 steps (games) each
RUNS = 100
STEPS = 5000

res_greedy = run_experiment(0.0, total_runs=RUNS, steps=STEPS)
res_eps_01 = run_experiment(0.1, total_runs=RUNS, steps=STEPS)
res_eps_03 = run_experiment(0.3, total_runs=RUNS, steps=STEPS)

fig, ax = plt.subplots(figsize=(12, 6))
# Raw strings: "\e" is not a valid Python escape sequence, so the non-raw
# labels triggered an invalid-escape warning for each of these lines.
ax.plot(res_greedy, label=r'$\epsilon = 0$ (greedy)', color='gray')
ax.plot(res_eps_01, label=r'$\epsilon = 0.1$', color='green')
ax.plot(res_eps_03, label=r'$\epsilon = 0.3$', color='red')

ax.set_xlabel('Steps (Episodes)')
ax.set_ylabel('Average Reward (Win Probability)')
ax.set_title('10-Armed Testbed Logic applied to Tic-Tac-Toe (Ensemble Average)')
ax.legend()
ax.grid(True, alpha=0.2)
plt.show()

  plt.plot(res_greedy, label='$\epsilon = 0$ (greedy)', color='gray')
  plt.plot(res_eps_01, label='$\epsilon = 0.1$', color='green')
  plt.plot(res_eps_03, label='$\epsilon = 0.3$', color='red')


Running 100 independent experiments for epsilon=0.0...


100%|██████████| 100/100 [37:49<00:00, 22.70s/it]


Running 100 independent experiments for epsilon=0.1...


 33%|███▎      | 33/100 [15:41<43:47, 39.21s/it]