In [1]:
import numpy as np
import random
import pickle  # For saving value function


In [3]:

# Tic-Tac-Toe environment
class TicTacToe:
    def __init__(self):
        self.board = np.zeros(9, dtype=int)  # 3x3 board as a flat array
        self.current_player = 1  # 1 for 'X', -1 for 'O'
    
    def reset(self):
        self.board[:] = 0
        self.current_player = 1
        return tuple(self.board)  # Return state

    def available_actions(self):
        return [i for i in range(9) if self.board[i] == 0]

    def step(self, action):
        """ Apply action and switch player """
        if self.board[action] != 0:
            raise ValueError("Invalid Move")
        
        self.board[action] = self.current_player
        reward, done = self.check_winner()
        self.current_player *= -1  # Switch player
        return tuple(self.board), reward, done  # New state, reward, game over?

    def check_winner(self):
        """ Check if game is won, lost, or draw """
        winning_combos = [(0,1,2), (3,4,5), (6,7,8), (0,3,6), (1,4,7), (2,5,8), (0,4,8), (2,4,6)]
        
        for (i, j, k) in winning_combos:
            if self.board[i] == self.board[j] == self.board[k] and self.board[i] != 0:
                return (1 if self.board[i] == 1 else -1), True  # +1 for X, -1 for O
        
        if 0 not in self.board:  # Draw condition
            return 0, True

        return 0, False  # No winner, continue game



In [5]:
# Value Function Approximation
class ValueFunctionAgent:
    def __init__(self, alpha=0.1, epsilon=0.1):
        self.value_table = {}  # State-value estimates
        self.alpha = alpha  # Learning rate
        self.epsilon = epsilon  # Exploration rate

    def get_value(self, state):
        """ Retrieve value of a state, default to 0 """
        return self.value_table.get(state, 0.5)  # Initialize unknown states at 0.5

    def choose_action(self, state, available_actions):
        """ Epsilon-greedy action selection """
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)  # Explore
        else:
            # Exploit: Select action that leads to the best estimated value
            values = [self.get_value(self.get_next_state(state, a)) for a in available_actions]
            return available_actions[np.argmax(values)]

    def get_next_state(self, state, action):
        """ Simulate next state """
        new_state = list(state)
        new_state[action] = 1  # Assume current player is AI
        return tuple(new_state)

    def update_value(self, state, reward, next_state):
        """ Update value function using TD-learning """
        self.value_table[state] = self.get_value(state) + self.alpha * (reward + self.get_value(next_state) - self.get_value(state))

    def save_value_table(self, filename="value_table.pkl"):
        with open(filename, "wb") as f:
            pickle.dump(self.value_table, f)

    def load_value_table(self, filename="value_table.pkl"):
        try:
            with open(filename, "rb") as f:
                self.value_table = pickle.load(f)
        except FileNotFoundError:
            print("No saved value table found, starting fresh.")



In [7]:
# Training the Value Function Agent
def train_agent(episodes=50000):
    env = TicTacToe()
    agent = ValueFunctionAgent()
    
    for episode in range(episodes):
        state = env.reset()
        done = False
        trajectory = []

        while not done:
            available_actions = env.available_actions()
            action = agent.choose_action(state, available_actions)
            next_state, reward, done = env.step(action)

            # Store trajectory for Monte Carlo updates
            trajectory.append((state, action, reward))
            state = next_state

        # Monte Carlo update of Value Function
        for state, action, reward in reversed(trajectory):
            agent.update_value(state, reward, state)

    print("Training complete!")
    agent.save_value_table()
    return agent


In [9]:

# 10-Armed Testbed for Evaluating Agents
def test_10_agents():
    """ Simulates 10 agents with different exploration strategies """
    env = TicTacToe()
    agents = [ValueFunctionAgent(epsilon=i * 0.1) for i in range(10)]

    win_rates = np.zeros(10)

    for i, agent in enumerate(agents):
        wins = 0
        for _ in range(1000):  # Play 1000 games per agent
            state = env.reset()
            done = False

            while not done:
                available_actions = env.available_actions()
                action = agent.choose_action(state, available_actions)
                state, reward, done = env.step(action)

            if reward == 1:
                wins += 1

        win_rates[i] = wins / 1000  # Store win percentage

    print("Exploration Rate vs. Win Rate:")
    for i, win_rate in enumerate(win_rates):
        print(f"Epsilon = {i * 0.1}: Win Rate = {win_rate:.3f}")



In [11]:
# Play against trained AI
def play_against_ai():
    env = TicTacToe()
    agent = ValueFunctionAgent()
    agent.load_value_table()
    
    state = env.reset()
    done = False
    
    print("\nTic-Tac-Toe Game!")
    while not done:
        if env.current_player == 1:  # AI's turn
            action = agent.choose_action(state, env.available_actions())
            print(f"AI chooses: {action}")
        else:  # Human's turn
            print(f"\nCurrent Board:\n{np.array(env.board).reshape(3,3)}")
            action = int(input("Enter position (0-8): "))
            while action not in env.available_actions():
                action = int(input("Invalid move! Enter position (0-8): "))
        
        state, reward, done = env.step(action)

    print("\nFinal Board:\n", np.array(env.board).reshape(3,3))
    if reward == 1:
        print("AI Wins!")
    elif reward == -1:
        print("You Win!")
    else:
        print("It's a Draw!")



In [13]:
# Train and test agents
if __name__ == "__main__":
    train_agent(episodes=50000)  # Train AI with Value Function
    test_10_agents()  # Test different exploration strategies
    play_against_ai()  # Play against trained AI


Training complete!
Exploration Rate vs. Win Rate:
Epsilon = 0.0: Win Rate = 1.000
Epsilon = 0.1: Win Rate = 0.799
Epsilon = 0.2: Win Rate = 0.704
Epsilon = 0.30000000000000004: Win Rate = 0.634
Epsilon = 0.4: Win Rate = 0.624
Epsilon = 0.5: Win Rate = 0.580
Epsilon = 0.6000000000000001: Win Rate = 0.586
Epsilon = 0.7000000000000001: Win Rate = 0.579
Epsilon = 0.8: Win Rate = 0.600
Epsilon = 0.9: Win Rate = 0.602

Tic-Tac-Toe Game!
AI chooses: 0

Current Board:
[[1 0 0]
 [0 0 0]
 [0 0 0]]


Enter position (0-8):  3


AI chooses: 1

Current Board:
[[ 1  1  0]
 [-1  0  0]
 [ 0  0  0]]


Enter position (0-8):  2


AI chooses: 4

Current Board:
[[ 1  1 -1]
 [-1  1  0]
 [ 0  0  0]]


Enter position (0-8):  8


AI chooses: 5

Current Board:
[[ 1  1 -1]
 [-1  1  1]
 [ 0  0 -1]]


Enter position (0-8):  7


AI chooses: 6

Final Board:
 [[ 1  1 -1]
 [-1  1  1]
 [ 1 -1 -1]]
It's a Draw!
