In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np

# Player is needed by the training/demo cells, which build Model(board, "X", Player("X"), Player("O")).
from ticTacToe import Controller, Model, Player, View

## Tic Tac Toe - Learning to Play Games

### Chapters:
- Building a sandbox environment
- Random Baseline
- Minimax (Classical AI)
- Q-Learning (Reinforcement Learning & Deep Learning Revolution)
- Reinforcement Learning from Human Feedback

In [None]:
class RandomAI:
    """Baseline player: every empty cell is equally likely to be chosen."""

    def __init__(self, model, playerType):
        self.playerType = playerType
        self.model = model

    def getAvailableMoves(self, board):
        """Return the (row, col) of every cell that still holds "+"."""
        return [
            (r, c)
            for r, line in enumerate(board)
            for c, mark in enumerate(line)
            if mark == "+"
        ]

    def getMove(self, model):
        """Store the model, then return one randomly picked empty cell of model.board."""
        self.model = model
        candidates = self.getAvailableMoves(model.board)
        pick = random.randint(0, len(candidates) - 1)
        return candidates[pick]

In [None]:
class RuleBasedAI:
    """Player with three rules in priority order: win now, block now, else move randomly."""

    def __init__(self, model, playerType):
        self.model = model
        self.playerType = playerType
        # The other mark: "O" when we play "X", and "X" for anything else.
        if playerType == "X":
            self.opponentType = "O"
        else:
            self.opponentType = "X"

    def getAvailableMoves(self, board):
        """Return the (row, col) of every cell that still holds "+"."""
        return [
            (r, c)
            for r, line in enumerate(board)
            for c, mark in enumerate(line)
            if mark == "+"
        ]

    # TODO: turn checkWinningMove into an exercise.
    def checkWinningMove(self, board, player):
        """
        Look for a line that 'player' completes with a single move.

        A line qualifies when it holds size-1 of the player's marks and exactly
        one "+". Lines are examined rows first, then columns, then the main
        diagonal, then the anti-diagonal; the empty cell of the first qualifying
        line is returned as (row, col). Returns None when no line qualifies.
        """
        n = len(board)

        # Coordinates of every line, listed in the order they must be examined.
        lines = []
        for r in range(n):
            lines.append([(r, c) for c in range(n)])
        for c in range(n):
            lines.append([(r, c) for r in range(n)])
        lines.append([(i, i) for i in range(n)])            # top-left to bottom-right
        lines.append([(i, n - 1 - i) for i in range(n)])    # top-right to bottom-left

        for coords in lines:
            marks = [board[r][c] for r, c in coords]
            if marks.count(player) == n - 1 and marks.count("+") == 1:
                return coords[marks.index("+")]

        return None

    def getMove(self, model):
        """Store the model and return the move selected by the three rules."""
        self.model = model
        grid = model.board

        # Rules 1 and 2: complete our own line first, otherwise block the opponent's.
        for mark in (self.playerType, self.opponentType):
            spot = self.checkWinningMove(grid, mark)
            if spot:
                return spot

        # Rule 3: nothing urgent, so any empty cell will do.
        empties = self.getAvailableMoves(grid)
        return empties[random.randint(0, len(empties) - 1)]

In [None]:
# =============================================================================
# Q-LEARNING AI - The Simplest Reinforcement Learning
# =============================================================================
#
# THE BIG IDEA:
# Instead of programming rules (like RuleBasedAI) or searching all possibilities
# (like Minimax), we let the AI LEARN from experience by playing many games.
#
# KEY CONCEPT - THE Q-TABLE:
# A dictionary that maps: (board_state, action) → "how good is this move?"
#
#   Q-Table (simplified example):
#   ┌─────────────────────┬────────┬─────────┐
#   │ State (board)       │ Action │ Q-Value │
#   ├─────────────────────┼────────┼─────────┤
#   │ "X++|+++|+++"       │ (1,1)  │  0.8    │  ← "center is good when X plays first"
#   │ "X++|+++|+++"       │ (0,1)  │  0.2    │  ← "edge is okay"
#   │ "XO+|+++|+++"       │ (1,1)  │  0.9    │  ← "center is great here too"
#   └─────────────────────┴────────┴─────────┘
#
# HOW IT LEARNS (after each game):
# Every move we made is nudged toward the game's reward by the learning rate:
# 1. If we WON:  Q-values move toward +1
# 2. If we LOST: Q-values move toward -1
# 3. If TIE:     Q-values move toward +0.5
#
# EXPLORATION VS EXPLOITATION:
# - Exploration: Try random moves to discover new strategies (epsilon chance)
# - Exploitation: Use the best known move from Q-table (1 - epsilon chance)
# - Start with high exploration, gradually reduce it as we learn
#
# =============================================================================

class QLearningAI:
    """
    Tabular learner: keeps one value per (board string, move) pair it has played
    and chooses moves epsilon-greedily from that table.
    """

    def __init__(self, model, playerType, epsilon=0.3, learning_rate=0.5):
        self.model = model
        self.playerType = playerType

        # Q-table. Key: (board_as_string, (row, col)). Value: current estimate of
        # how good that move is. Pairs that are missing count as 0.
        self.q_table = {}

        # Probability of playing a random move instead of the best known one.
        self.epsilon = epsilon

        # Fraction of the gap between the stored value and the reward that is
        # closed on every update (larger = faster but less stable).
        self.learning_rate = learning_rate

        # (state, action) pairs chosen since the last call to learn().
        self.move_history = []

    def boardToString(self, board):
        """
        Flatten the board into a hashable key: cells of a row are concatenated,
        rows are separated by "|".
        Example: [["X","+","+"],["O","+","+"],["+","+","+"]] → "X++|O++|+++"
        """
        rows_as_text = ("".join(cells) for cells in board)
        return "|".join(rows_as_text)

    def getAvailableMoves(self, board):
        """Return the (row, col) of every cell that still holds "+"."""
        return [
            (r, c)
            for r, line in enumerate(board)
            for c, mark in enumerate(line)
            if mark == "+"
        ]

    def getQValue(self, state, action):
        """Look up the stored value for (state, action); unseen pairs are worth 0.0."""
        key = (state, action)
        return self.q_table.get(key, 0.0)

    def getMove(self, model):
        """
        Epsilon-greedy move selection.

        One random draw decides the mode: below epsilon we explore (uniform over
        the empty cells), otherwise we exploit (uniform over the empty cells that
        share the highest Q-value). The chosen (state, action) pair is appended
        to move_history so learn() can update it later.
        """
        self.model = model
        grid = model.board
        state = self.boardToString(grid)
        legal = self.getAvailableMoves(grid)

        explore = random.random() < self.epsilon
        if explore:
            action = random.choice(legal)
        else:
            scored = [(self.getQValue(state, mv), mv) for mv in legal]

            # Highest value among the legal moves.
            top = float('-inf')
            for value, _ in scored:
                if value > top:
                    top = value

            # Ties are broken at random among all moves that reach that value.
            contenders = [mv for value, mv in scored if value == top]
            action = random.choice(contenders)

        self.move_history.append((state, action))
        return action

    # TODO: turn learn() into an exercise.
    def learn(self, reward):
        """
        Apply the end-of-game reward to every move recorded in move_history.
        Callers in this notebook pass +1 (win), -1 (loss) or +0.5 (tie).

        Each recorded pair is updated with
            Q(state, action) += learning_rate * (reward - Q(state, action))
        which pulls the stored value toward the reward. The history is emptied
        afterwards so the next game starts clean.
        """
        for state, action in self.move_history:
            current = self.getQValue(state, action)
            self.q_table[(state, action)] = current + self.learning_rate * (reward - current)

        self.move_history = []

    def decayEpsilon(self, decay_rate=0.99):
        """Multiply epsilon by decay_rate so exploration shrinks as training goes on."""
        self.epsilon *= decay_rate

In [None]:
# =============================================================================
# TRAINING THE Q-LEARNING AI
# =============================================================================
# We train by playing many games against a RandomAI opponent.
# After each game, we update Q-values based on win/loss/tie.
# =============================================================================

def trainQLearning(num_games=5000, epsilon=0.5, learning_rate=0.5,
                   decay_rate=0.999, report_every=100):
    """
    Train a Q-Learning AI (playing X, moving first) against a RandomAI (playing O).

    After every game the agent receives +1 for a win, -1 for a loss and +0.5
    for a tie, then its exploration rate is multiplied by decay_rate. A summary
    is printed and the cumulative win rate is plotted when training ends.

    Args:
        num_games: number of training games to play.
        epsilon: initial exploration rate of the agent.
        learning_rate: learning rate of the agent.
        decay_rate: factor applied to epsilon after each game.
        report_every: record one win-rate data point every this many games.

    Returns:
        The trained QLearningAI instance.

    Raises:
        ValueError: if report_every is smaller than 1.
    """
    if report_every < 1:
        raise ValueError("report_every must be at least 1")

    # Create the Q-Learning agent (starts knowing nothing)
    q_ai = QLearningAI(model=None, playerType="X", epsilon=epsilon, learning_rate=learning_rate)

    # The random opponent keeps no state between games, so one instance is enough.
    opponent = RandomAI(model=None, playerType="O")

    # Track results for the summary and the plot
    results = {"wins": 0, "losses": 0, "ties": 0}
    checkpoints = []  # number of games played at each recorded data point
    win_rates = []    # cumulative win rate (%) at each checkpoint

    for game_num in range(num_games):
        # Create fresh game
        board = [["+" for _ in range(3)] for _ in range(3)]
        model = Model(board, "X", Player("X"), Player("O"))

        # Play the game
        while not model.gameOver:
            if model.turn == "X":
                x, y = q_ai.getMove(model)      # Q-Learning AI's turn
            else:
                x, y = opponent.getMove(model)  # Opponent's turn

            model.makeMove(x, y)
            model.checkGameOver()

            # The turn is switched here, by hand, and only while the game goes on.
            if not model.gameOver:
                model.turn = "O" if model.turn == "X" else "X"

        # Game over - give reward to Q-Learning AI
        if model.winner == "X":
            q_ai.learn(reward=1.0)   # Win!
            results["wins"] += 1
        elif model.winner == "O":
            q_ai.learn(reward=-1.0)  # Loss
            results["losses"] += 1
        else:
            q_ai.learn(reward=0.5)   # Tie
            results["ties"] += 1

        # Reduce exploration over time
        q_ai.decayEpsilon(decay_rate=decay_rate)

        # Record the cumulative win rate (wins so far / games so far)
        games_played = game_num + 1
        if games_played % report_every == 0:
            checkpoints.append(games_played)
            win_rates.append(results["wins"] / games_played * 100)

    def percent(count):
        # With no games played there is nothing to divide by; report 0%.
        return count / num_games * 100 if num_games > 0 else 0.0

    # Print final results
    print(f"Training complete! Results after {num_games} games:")
    print(f"  Wins:   {results['wins']} ({percent(results['wins']):.1f}%)")
    print(f"  Losses: {results['losses']} ({percent(results['losses']):.1f}%)")
    print(f"  Ties:   {results['ties']} ({percent(results['ties']):.1f}%)")
    print(f"  Q-table size: {len(q_ai.q_table)} state-action pairs learned")
    print(f"  Final epsilon: {q_ai.epsilon:.4f}")

    # Plot learning progress
    plt.figure(figsize=(10, 4))
    plt.plot(checkpoints, win_rates)
    plt.xlabel("Games Played")
    plt.ylabel("Win Rate (%)")
    plt.title("Q-Learning AI: Win Rate Over Training")
    plt.grid(True)
    plt.show()

    return q_ai

# Train the AI! Plays 5000 games against RandomAI, prints the summary, shows the
# win-rate plot, and keeps the trained agent in `trained_q_ai`.
trained_q_ai = trainQLearning(num_games=5000)

In [None]:
# =============================================================================
# RLHF AI - Reinforcement Learning from Human Feedback
# =============================================================================
#
# THE KEY DIFFERENCE FROM Q-LEARNING:
# Instead of learning from win/loss, we learn from HUMAN PREFERENCES.
# We show the human two possible moves and ask: "Which is better?"
#
# HOW IT WORKS:
# 1. Given a board state, generate two possible moves
# 2. Show both options to the human
# 3. Human picks which move they prefer
# 4. Update scores: preferred move gets +learning_rate, rejected move gets -learning_rate
#
# THIS DEMONSTRATES:
# - How modern AI (like ChatGPT) learns from human preferences
# - REWARD MISSPECIFICATION: AI learns what humans REWARD, not what they WANT
#   Example: If humans always pick "aggressive" moves, AI becomes aggressive
#            even when defensive play would win the game
#
# =============================================================================

class RLHF_AI:
    """
    Tic-tac-toe player whose move scores come from pairwise human comparisons.

    A preference table maps (board_string, (row, col)) to a score. Each
    comparison raises the chosen move's score by learning_rate and lowers the
    rejected move's score by the same amount; getMove then plays one of the
    highest-scoring empty cells for the current board.
    """

    def __init__(self, model, playerType, learning_rate=0.3):
        self.model = model
        self.playerType = playerType

        # PREFERENCE TABLE: like Q-table but learned from human comparisons
        # Maps (state, action) → preference score
        self.preference_table = {}

        # Amount added to / subtracted from a score per comparison
        self.learning_rate = learning_rate

        # Number of comparisons that actually updated the table (skips excluded)
        self.comparisons_made = 0

    def boardToString(self, board):
        """Convert board to string for use as dictionary key (rows joined by "|")."""
        return "|".join(["".join(row) for row in board])

    def getAvailableMoves(self, board):
        """Return the (row, col) of every cell that still holds "+"."""
        return [
            (rowIdx, colIdx)
            for rowIdx, row in enumerate(board)
            for colIdx, cell in enumerate(row)
            if cell == "+"
        ]

    def getPreference(self, state, action):
        """Get preference score for a (state, action) pair; unseen pairs score 0.0."""
        return self.preference_table.get((state, action), 0.0)

    def displayBoard(self, board):
        """Print the board with row/column indices (header is laid out for 3 columns)."""
        print("    0   1   2")
        print("  +---+---+---+")
        for idx, row in enumerate(board):
            print(f"{idx} | {' | '.join(row)} |")
            print("  +---+---+---+")

    def displayComparison(self, board, move_a, move_b):
        """Show the human the current board and the two candidate moves A and B."""
        print("\n" + "="*50)
        print("CURRENT BOARD:")
        self.displayBoard(board)

        print(f"\nYou are playing as: {self.playerType}")
        print("\nWhich move is BETTER?")
        print(f"  [A] Play at position {move_a} (row={move_a[0]}, col={move_a[1]})")
        print(f"  [B] Play at position {move_b} (row={move_b[0]}, col={move_b[1]})")
        print("="*50)

    def collectHumanPreference(self, board, move_a, move_b):
        """
        Ask human to compare two moves; keeps asking until the answer is valid.

        Returns:
            (preferred_move, rejected_move), or (None, None) if the human skips.
        """
        self.displayComparison(board, move_a, move_b)

        while True:
            choice = input("Enter A or B (or 'skip' to skip): ").strip().upper()
            if choice == 'A':
                return (move_a, move_b)
            elif choice == 'B':
                return (move_b, move_a)
            elif choice == 'SKIP':
                return (None, None)
            else:
                print("Invalid input. Please enter A, B, or 'skip'")

    def updateFromComparison(self, state, preferred, rejected):
        """
        Update preference scores based on human choice.
        - Preferred move: score goes up by learning_rate
        - Rejected move: score goes down by learning_rate
        Does nothing (and does not count the comparison) when preferred is None.
        """
        if preferred is None:
            return

        # Increase score for preferred move
        old_pref = self.getPreference(state, preferred)
        self.preference_table[(state, preferred)] = old_pref + self.learning_rate

        # Decrease score for rejected move
        old_rej = self.getPreference(state, rejected)
        self.preference_table[(state, rejected)] = old_rej - self.learning_rate

        self.comparisons_made += 1

    def _hasCompletedLine(self, board):
        """Return True if some row, column or diagonal is filled with one player's mark."""
        size = len(board)
        lines = [list(row) for row in board]
        lines += [[board[r][c] for r in range(size)] for c in range(size)]
        lines.append([board[i][i] for i in range(size)])
        lines.append([board[i][size - 1 - i] for i in range(size)])
        return any(line[0] != "+" and line.count(line[0]) == size for line in lines)

    def _randomTrainingBoard(self, max_plies=5):
        """
        Build a random, unfinished 3x3 position in which it is playerType's turn.

        Assumes the convention used by the games in this notebook: X makes the
        first move and players alternate. X is therefore to move after an even
        number of plies and O after an odd number. Positions with the wrong
        parity can never be looked up by getMove, so they are not generated.
        Any other playerType falls back to 0..max_plies plies.
        """
        if self.playerType == "X":
            parity = 0
        elif self.playerType == "O":
            parity = 1
        else:
            parity = None
        ply_options = [n for n in range(max_plies + 1) if parity is None or n % 2 == parity]

        pieces = ["X", "O"]
        cells = [(r, c) for r in range(3) for c in range(3)]
        while True:
            board = [["+" for _ in range(3)] for _ in range(3)]
            plies = random.choice(ply_options)
            for ply, (r, c) in enumerate(random.sample(cells, plies)):
                board[r][c] = pieces[ply % 2]
            # A board that already holds three in a row is a finished game: there
            # is no move to judge, so draw again. Boards with at most 2 plies can
            # never be finished, so this loop always terminates.
            if not self._hasCompletedLine(board):
                return board

    def train(self, num_comparisons=10):
        """
        Interactive training session.

        Shows num_comparisons random positions (always unfinished, always with
        playerType to move) and asks the human which of two random empty cells
        is the better move. Each answer updates the preference table.
        """
        print("\n" + "="*50)
        print("RLHF TRAINING SESSION")
        print("="*50)
        print("You will be shown board positions with two possible moves.")
        print("Pick which move you think is BETTER.")
        print("Your preferences will train the AI!\n")

        for i in range(num_comparisons):
            print(f"\n--- Comparison {i+1}/{num_comparisons} ---")

            # At most 5 of the 9 cells are filled, so at least 4 moves are available.
            board = self._randomTrainingBoard()
            available = self.getAvailableMoves(board)

            # Pick two random moves to compare
            move_a, move_b = random.sample(available, 2)

            # Get human preference
            state = self.boardToString(board)
            preferred, rejected = self.collectHumanPreference(board, move_a, move_b)

            # Update from preference
            self.updateFromComparison(state, preferred, rejected)

        print("\n" + "="*50)
        print(f"Training complete! Made {self.comparisons_made} comparisons.")
        print(f"Preference table size: {len(self.preference_table)} entries")
        print("="*50)

    def getMove(self, model):
        """
        Choose best move according to learned human preferences.
        Ties between equally scored moves are broken at random.
        """
        self.model = model
        board = model.board
        state = self.boardToString(board)
        available = self.getAvailableMoves(board)

        # Find move(s) with highest preference score
        best_score = float('-inf')
        best_moves = []

        for move in available:
            score = self.getPreference(state, move)
            if score > best_score:
                best_score = score
                best_moves = [move]
            elif score == best_score:
                best_moves.append(move)

        return random.choice(best_moves)

In [None]:
# =============================================================================
# DEMO: REWARD MISSPECIFICATION
# =============================================================================
# This demonstrates how RLHF can go wrong when humans reward the WRONG thing.
#
# We simulate a "biased human" who always prefers:
#   - Center moves (1,1)
#   - Corner moves (0,0), (0,2), (2,0), (2,2)
# ...even when an edge move would be strategically better!
#
# The AI will learn this bias and play "stylishly" instead of optimally.
# =============================================================================

class BiasedHumanSimulator:
    """
    Stand-in for a human rater who likes the center and the corners.

    The rater never looks at the board: a move is "liked" purely because of
    where it is, which is what makes the feedback a misspecified reward.
    """

    def __init__(self, bias_type="center_corner"):
        # Stored for reference only; choose() does not read it.
        self.bias_type = bias_type
        # Liked cells: the center followed by the four corners.
        self.preferred = [(1, 1), (0, 0), (0, 2), (2, 0), (2, 2)]

    def choose(self, move_a, move_b):
        """
        Rank two moves by position alone.

        If exactly one of them is a liked cell, it wins. If both or neither
        are liked, a coin flip decides.
        Returns: (preferred, rejected)
        """
        likes_a = move_a in self.preferred
        likes_b = move_b in self.preferred

        # Exactly one liked move: the bias decides.
        if likes_a != likes_b:
            return (move_a, move_b) if likes_a else (move_b, move_a)

        # No basis for a preference: keep or swap the order with equal probability.
        keep_order = random.random() < 0.5
        return (move_a, move_b) if keep_order else (move_b, move_a)


def demonstrateRewardMisspecification(num_comparisons=200, num_test_games=500):
    """
    Train RLHF AI with a biased human simulator.
    Then test it against RandomAI to see how the bias affects performance.

    Args:
        num_comparisons: number of simulated human comparisons used for training.
        num_test_games: number of evaluation games played against RandomAI.

    Returns:
        The trained RLHF_AI instance (it plays X).
    """
    print("="*60)
    print("REWARD MISSPECIFICATION DEMO")
    print("="*60)
    print("\nTraining an AI with a BIASED human who prefers center/corners...")
    print("The human doesn't consider whether the move actually helps WIN.\n")

    # Create AI and biased human
    rlhf_ai = RLHF_AI(model=None, playerType="X")
    biased_human = BiasedHumanSimulator()

    # Simulate training with biased feedback
    all_cells = [(r, c) for r in range(3) for c in range(3)]
    pieces = ["X", "O"]
    for _ in range(num_comparisons):
        # Random position with X to move. X opens every game below, so X only
        # ever faces boards holding an even number of pieces; odd counts would
        # produce table entries that getMove can never look up. With at most 4
        # pieces no line can be complete and at least 5 cells stay empty.
        board = [["+" for _ in range(3)] for _ in range(3)]
        plies = random.choice([0, 2, 4])
        for ply, (r, c) in enumerate(random.sample(all_cells, plies)):
            board[r][c] = pieces[ply % 2]

        available = rlhf_ai.getAvailableMoves(board)
        move_a, move_b = random.sample(available, 2)
        state = rlhf_ai.boardToString(board)

        # Get BIASED human preference (not optimal!)
        preferred, rejected = biased_human.choose(move_a, move_b)
        rlhf_ai.updateFromComparison(state, preferred, rejected)

    print(f"Training complete! {rlhf_ai.comparisons_made} comparisons made.\n")

    # Show what the AI actually learned: average table score per position type
    corner_cells = {(0, 0), (0, 2), (2, 0), (2, 2)}
    scores_by_type = {"Center (1,1)": [], "Corners": [], "Edges": []}
    for (_, move), score in rlhf_ai.preference_table.items():
        if move == (1, 1):
            scores_by_type["Center (1,1)"].append(score)
        elif move in corner_cells:
            scores_by_type["Corners"].append(score)
        else:
            scores_by_type["Edges"].append(score)

    print("What the AI learned (preference scores):")
    for label, scores in scores_by_type.items():
        if scores:
            average = sum(scores) / len(scores)
            print(f"  {label}: average {average:+.2f} over {len(scores)} entries")
        else:
            print(f"  {label}: no comparisons seen")
    print("\nThis is REWARD MISSPECIFICATION - the AI learned our style,")
    print("not how to win!\n")

    # Test against RandomAI
    print(f"Testing biased RLHF AI vs RandomAI ({num_test_games} games)...")
    wins = 0
    losses = 0
    ties = 0

    # The random opponent keeps no state between games, so one instance is enough.
    opponent = RandomAI(model=None, playerType="O")

    for _ in range(num_test_games):
        board = [["+" for _ in range(3)] for _ in range(3)]
        model = Model(board, "X", Player("X"), Player("O"))

        while not model.gameOver:
            if model.turn == "X":
                x, y = rlhf_ai.getMove(model)
            else:
                x, y = opponent.getMove(model)

            model.makeMove(x, y)
            model.checkGameOver()
            # The turn is switched here, by hand, and only while the game goes on.
            if not model.gameOver:
                model.turn = "O" if model.turn == "X" else "X"

        if model.winner == "X":
            wins += 1
        elif model.winner == "O":
            losses += 1
        else:
            ties += 1

    def percent(count):
        # With no test games there is nothing to divide by; report 0%.
        return count / num_test_games * 100 if num_test_games > 0 else 0.0

    print("\nResults:")
    print(f"  Wins:   {wins} ({percent(wins):.1f}%)")
    print(f"  Losses: {losses} ({percent(losses):.1f}%)")
    print(f"  Ties:   {ties} ({percent(ties):.1f}%)")

    print("\n" + "="*60)
    print("KEY INSIGHT:")
    print("="*60)
    print("The AI plays 'stylishly' (center/corners) but not OPTIMALLY.")
    print("It learned what we REWARDED, not what we actually WANTED (winning).")
    print("This is why specifying rewards correctly is so important in AI!")
    print("="*60)

    return rlhf_ai

# Run the demo! Trains an RLHF_AI on simulated biased feedback, plays it against
# RandomAI, and keeps the trained agent in `biased_rlhf_ai`.
biased_rlhf_ai = demonstrateRewardMisspecification()

In [None]:
# =============================================================================
# COMPARING MOVE DISTRIBUTIONS: Q-Learning vs RLHF
# =============================================================================
# This visualizes WHERE each AI likes to play, showing how biased human
# feedback changes the AI's behavior compared to learning from wins/losses.
# =============================================================================

def _percent(count, total):
    """Return count as a percentage of total, or 0.0 when total is not positive."""
    return count / total * 100 if total > 0 else 0.0


def _playGameAgainstRandom(ai, opponent, move_counts=None):
    """
    Play one game with `ai` as X (moving first) against `opponent` as O.

    When move_counts (a 3x3 array) is given, every move made by `ai` increments
    the matching cell. Returns model.winner once the game is over.
    """
    board = [["+" for _ in range(3)] for _ in range(3)]
    model = Model(board, "X", Player("X"), Player("O"))

    while not model.gameOver:
        if model.turn == "X":
            x, y = ai.getMove(model)
            if move_counts is not None:
                move_counts[x][y] += 1  # Track the move
        else:
            x, y = opponent.getMove(model)
        model.makeMove(x, y)
        model.checkGameOver()
        # The turn is switched here, by hand, and only while the game goes on.
        if not model.gameOver:
            model.turn = "O" if model.turn == "X" else "X"

    return model.winner


def _evaluateAgainstRandom(ai, num_games):
    """Play num_games games against RandomAI; return (move_counts, wins, losses, ties)."""
    move_counts = np.zeros((3, 3))
    opponent = RandomAI(model=None, playerType="O")
    wins, losses, ties = 0, 0, 0
    for _ in range(num_games):
        winner = _playGameAgainstRandom(ai, opponent, move_counts)
        if winner == "X":
            wins += 1
        elif winner == "O":
            losses += 1
        else:
            ties += 1
    return move_counts, wins, losses, ties


def _movePercentages(move_counts):
    """Convert a grid of move counts to percentages of all moves (all zeros if no moves)."""
    total = move_counts.sum()
    if total > 0:
        return move_counts / total * 100
    return np.zeros_like(move_counts)


def _positionTypeTotals(moves_pct):
    """Sum a 3x3 percentage grid into (corners, edges, center)."""
    corners = moves_pct[0,0] + moves_pct[0,2] + moves_pct[2,0] + moves_pct[2,2]
    edges = moves_pct[0,1] + moves_pct[1,0] + moves_pct[1,2] + moves_pct[2,1]
    center = moves_pct[1,1]
    return corners, edges, center


def _drawMoveHeatmap(ax, moves_pct, cmap, title, vmax):
    """Draw one 3x3 move-distribution heatmap with the percentage written in each cell."""
    ax.imshow(moves_pct, cmap=cmap, vmin=0, vmax=vmax)
    ax.set_title(title, fontsize=12)
    for i in range(3):
        for j in range(3):
            ax.text(j, i, f'{moves_pct[i,j]:.1f}%', ha='center', va='center', fontsize=14, fontweight='bold')
    ax.set_xticks([0, 1, 2])
    ax.set_yticks([0, 1, 2])
    ax.set_xticklabels(['Col 0', 'Col 1', 'Col 2'])
    ax.set_yticklabels(['Row 0', 'Row 1', 'Row 2'])


def compareMoveDitributions(num_training_games=3000, num_test_games=500, num_comparisons=500):
    """
    Train both Q-Learning and RLHF (with biased human), then compare
    which positions they prefer to play.

    (The misspelling in the function name is kept so existing callers still work.)

    Args:
        num_training_games: games the Q-Learning AI plays against RandomAI.
        num_test_games: evaluation games each AI plays against RandomAI.
        num_comparisons: simulated biased-human comparisons used to train RLHF.

    Returns:
        (q_ai, rlhf_ai): the trained QLearningAI and RLHF_AI instances.
    """

    print("="*70)
    print("MOVE DISTRIBUTION COMPARISON: Q-Learning vs Biased RLHF")
    print("="*70)

    # -------------------------------------------------------------------------
    # 1. Train Q-Learning AI (learns from wins/losses)
    # -------------------------------------------------------------------------
    print("\n[1/4] Training Q-Learning AI (learns from game outcomes)...")

    q_ai = QLearningAI(model=None, playerType="X", epsilon=0.5, learning_rate=0.5)
    training_opponent = RandomAI(model=None, playerType="O")

    for _ in range(num_training_games):
        winner = _playGameAgainstRandom(q_ai, training_opponent)
        if winner == "X":
            q_ai.learn(reward=1.0)
        elif winner == "O":
            q_ai.learn(reward=-1.0)
        else:
            q_ai.learn(reward=0.5)
        q_ai.decayEpsilon(decay_rate=0.999)

    q_ai.epsilon = 0  # No exploration during testing
    print(f"   Q-Learning trained! Q-table size: {len(q_ai.q_table)}")

    # -------------------------------------------------------------------------
    # 2. Train RLHF AI (learns from biased human preferences)
    # -------------------------------------------------------------------------
    print("\n[2/4] Training RLHF AI (learns from biased human who loves center/corners)...")

    rlhf_ai = RLHF_AI(model=None, playerType="X", learning_rate=0.3)
    biased_human = BiasedHumanSimulator()

    all_cells = [(r, c) for r in range(3) for c in range(3)]
    pieces = ["X", "O"]
    for _ in range(num_comparisons):
        # Random position with X to move. X opens every game, so X only ever
        # faces boards holding an even number of pieces; odd counts would
        # produce table entries that getMove can never look up. With at most 4
        # pieces no line can be complete and at least 5 cells stay empty.
        board = [["+" for _ in range(3)] for _ in range(3)]
        plies = random.choice([0, 2, 4])
        for ply, (r, c) in enumerate(random.sample(all_cells, plies)):
            board[r][c] = pieces[ply % 2]

        available = rlhf_ai.getAvailableMoves(board)
        move_a, move_b = random.sample(available, 2)
        state = rlhf_ai.boardToString(board)
        preferred, rejected = biased_human.choose(move_a, move_b)
        rlhf_ai.updateFromComparison(state, preferred, rejected)

    print(f"   RLHF trained! {rlhf_ai.comparisons_made} comparisons made.")

    # -------------------------------------------------------------------------
    # 3. Collect move distributions from test games
    # -------------------------------------------------------------------------
    print(f"\n[3/4] Playing {num_test_games} test games with each AI...")

    q_moves, q_wins, q_losses, q_ties = _evaluateAgainstRandom(q_ai, num_test_games)
    # getMove recorded every evaluation move, but learn() is never called for
    # these games; drop them so a later learn() does not credit them to one game.
    q_ai.move_history = []

    rlhf_moves, rlhf_wins, rlhf_losses, rlhf_ties = _evaluateAgainstRandom(rlhf_ai, num_test_games)

    q_win_pct = _percent(q_wins, num_test_games)
    rlhf_win_pct = _percent(rlhf_wins, num_test_games)

    # -------------------------------------------------------------------------
    # 4. Visualize the results
    # -------------------------------------------------------------------------
    print("\n[4/4] Creating visualizations...")

    # Normalize to percentages
    q_moves_pct = _movePercentages(q_moves)
    rlhf_moves_pct = _movePercentages(rlhf_moves)

    # Create figure with multiple plots
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Both heatmaps share one color scale so their shades are comparable.
    shared_vmax = max(q_moves_pct.max(), rlhf_moves_pct.max())

    # Plot 1: Q-Learning heatmap
    _drawMoveHeatmap(axes[0], q_moves_pct, 'Blues',
                     f'Q-Learning Move Distribution\nWins: {q_win_pct:.1f}%', shared_vmax)

    # Plot 2: RLHF heatmap
    _drawMoveHeatmap(axes[1], rlhf_moves_pct, 'Reds',
                     f'RLHF (Biased) Move Distribution\nWins: {rlhf_win_pct:.1f}%', shared_vmax)

    # Plot 3: Bar chart comparison
    positions = ['Corners\n(0,0)(0,2)\n(2,0)(2,2)', 'Edges\n(0,1)(1,0)\n(1,2)(2,1)', 'Center\n(1,1)']

    # Calculate totals for each category
    q_corners, q_edges, q_center = _positionTypeTotals(q_moves_pct)
    rlhf_corners, rlhf_edges, rlhf_center = _positionTypeTotals(rlhf_moves_pct)

    x = np.arange(3)
    width = 0.35

    q_heights = [q_corners, q_edges, q_center]
    rlhf_heights = [rlhf_corners, rlhf_edges, rlhf_center]
    bars1 = axes[2].bar(x - width/2, q_heights, width, label='Q-Learning', color='steelblue')
    bars2 = axes[2].bar(x + width/2, rlhf_heights, width, label='RLHF (Biased)', color='indianred')

    axes[2].set_ylabel('% of Total Moves')
    axes[2].set_title('Move Type Comparison', fontsize=12)
    axes[2].set_xticks(x)
    axes[2].set_xticklabels(positions)
    axes[2].legend()
    # Keep the usual 0-60 range, but grow it when a bar (plus its label) would
    # otherwise be cut off - a single category can exceed 60% of all moves.
    tallest = max(q_heights + rlhf_heights)
    axes[2].set_ylim(0, max(60, tallest * 1.15))

    # Add value labels on bars
    for bar in list(bars1) + list(bars2):
        height = bar.get_height()
        axes[2].text(bar.get_x() + bar.get_width()/2., height, f'{height:.1f}%', ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.show()

    # Print summary
    print("\n" + "="*70)
    print("RESULTS SUMMARY")
    print("="*70)
    print(f"\n{'Metric':<25} {'Q-Learning':>15} {'RLHF (Biased)':>15}")
    print("-"*55)
    print(f"{'Win Rate':<25} {q_win_pct:>14.1f}% {rlhf_win_pct:>14.1f}%")
    print(f"{'Center moves':<25} {q_center:>14.1f}% {rlhf_center:>14.1f}%")
    print(f"{'Corner moves':<25} {q_corners:>14.1f}% {rlhf_corners:>14.1f}%")
    print(f"{'Edge moves':<25} {q_edges:>14.1f}% {rlhf_edges:>14.1f}%")

    print("\n" + "="*70)
    print("KEY INSIGHT")
    print("="*70)
    print("""
    Q-Learning learned from WINS → plays strategically
    RLHF learned from BIASED HUMAN → plays "stylishly" (center/corners)
    
    Notice how RLHF has:
    - HIGHER center/corner usage (what the biased human rewarded)
    - LOWER edge usage (what the biased human ignored)
    - LOWER win rate (because style ≠ strategy)
    
    The AI perfectly learned what we taught it...
    but we taught it the WRONG thing!
    """)
    print("="*70)

    return q_ai, rlhf_ai

# Run the comparison! Trains both AIs with the default settings, shows the plots
# and summary, and keeps the two trained agents for later cells.
q_ai_trained, rlhf_ai_trained = compareMoveDitributions()