In [1]:
import numpy as np

# Board dimensions. Board.__init__ allocates its grid from these, and the
# training code sizes its per-action arrays as BOARD_ROWS * BOARD_COLS.
# The network's default input/output width (9) is hard-coded separately.
BOARD_ROWS = 3
BOARD_COLS = 3

In [342]:
import torch
import torch.nn as nn

class qnetwork(nn.Module):
    """Fully connected network mapping a flattened board to one score per cell.

    Architecture: Linear(input_dim, 128) -> ReLU -> Linear(128, 128) -> ReLU
    -> Linear(128, output_dim).

    Args:
        input_dim: Length of the input vector (default 9).
        output_dim: Number of output scores (default 9).
    """

    def __init__(self, input_dim=9, output_dim=9):
        super().__init__()
        hidden_width = 128
        # Layers are created in forward order so parameter initialisation
        # (and the state_dict keys net.0 / net.2 / net.4) stay the same.
        stack = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, output_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension ``input_dim``)."""
        scores = self.net(x)
        return scores

In [343]:
class Board():
    """Game board whose cells hold 0 (empty), 1 or -1 (the two players).

    Args:
        p1: Stored on the instance; not otherwise used by this class.
        p2: Stored on the instance; not otherwise used by this class.
        mover: Symbol (1 or -1) of the player who moves first.
    """

    def __init__(self, p1, p2, mover):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        # True once winner() has detected a win or a full board.
        self.isEnd = False
        # Symbol that the next updateState() call will place.
        self.playerSymbol = mover
        # Last flattened snapshot produced by getHash().
        self.boardHash = None

    def getHash(self):
        """Return the board flattened row-major into a 1-D array.

        The result is an independent copy, so it does not change when the
        board is updated afterwards (a plain reshape would return a view that
        silently tracks later moves).
        """
        self.boardHash = self.board.flatten()
        return self.boardHash

    def availablePositions(self):
        """Return the empty cells as a list of (row, col) tuples, row-major."""
        n_rows, n_cols = self.board.shape
        return [
            (i, j)
            for i in range(n_rows)
            for j in range(n_cols)
            if self.board[i, j] == 0
        ]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and switch turns."""
        self.board[position] = self.playerSymbol
        self.playerSymbol *= -1

    def winner(self):
        """Evaluate the board and update ``self.isEnd``.

        Returns:
            1 or -1 if that player fills a complete row, column or diagonal,
            0 if the board is full without a winner, None if play continues.
        """
        n_rows, n_cols = self.board.shape

        # (line sum, cells in that line): a player owns a line exactly when
        # the sum equals +/- the line length. Order is rows, columns,
        # main diagonal, anti-diagonal.
        lines = [(total, n_cols) for total in self.board.sum(axis=1)]
        lines += [(total, n_rows) for total in self.board.sum(axis=0)]
        if n_rows == n_cols:
            # Full diagonals only exist on a square board.
            lines.append((sum(self.board[i, i] for i in range(n_rows)), n_rows))
            lines.append(
                (sum(self.board[i, n_cols - i - 1] for i in range(n_rows)), n_rows)
            )

        for total, needed in lines:
            if total == needed:
                self.isEnd = True
                return 1
            if total == -needed:
                self.isEnd = True
                return -1

        # no available positions -> draw
        if not self.availablePositions():
            self.isEnd = True
            return 0

        # not end
        self.isEnd = False
        return None

    def availableMovesHash(self):
        """Return a flat array with 1 for empty cells and 0 for occupied ones."""
        return 1 - np.absolute(self.board.reshape(-1))

In [344]:
def training_move(model, current_board, epsilon):
    """Choose a move for the current position with an epsilon-greedy policy.

    Args:
        model: Callable network; given the flattened board as a float tensor
            it returns one score per cell.
        current_board: Object providing ``availablePositions()`` (list of
            (row, col) tuples) and ``getHash()`` (flattened board).
        epsilon: Probability threshold for taking a uniformly random move.

    Returns:
        The chosen (row, col) tuple, always one of the available positions.

    Raises:
        ValueError: If the board has no available positions.
    """
    actions = current_board.availablePositions()
    if len(actions) == 0:
        # Fail early with a clear message instead of an opaque numpy/torch error.
        raise ValueError("training_move called on a board with no available positions")

    if np.random.uniform(0, 1) <= epsilon:
        # take random action
        return actions[np.random.choice(len(actions))]

    with torch.no_grad():
        X = torch.from_numpy(np.array(current_board.getHash())).float()
        # Call the module itself rather than .forward() so registered hooks run.
        qs = model(X)

    # Restrict the arg-max to the scores of the available cells.
    available_indices = [row * BOARD_COLS + col for row, col in actions]
    best_action_idx = torch.argmax(qs[available_indices]).item()
    return actions[best_action_idx]

In [345]:
def _evaluate_actions(model, board):
    """Simulate each available move for the player to move on ``board``.

    Returns:
        (rewards, next_q_values, dones): three arrays of length
        BOARD_ROWS * BOARD_COLS indexed by row * BOARD_COLS + col. Entries for
        occupied cells stay at 0 / 0 / False.
    """
    n_actions = BOARD_ROWS * BOARD_COLS
    rewards = np.zeros(n_actions, dtype=np.float32)
    next_q_values = np.zeros(n_actions, dtype=np.float32)
    dones = np.zeros(n_actions, dtype=bool)

    for (i, j) in board.availablePositions():
        action_index = i * BOARD_COLS + j

        # Work on a scratch copy so the real game state is untouched.
        sim_board = Board(1, -1, board.playerSymbol)
        sim_board.board = board.board.copy()
        sim_board.updateState((i, j))

        outcome = sim_board.winner()
        if sim_board.isEnd:
            # outcome is 1 (win), -1 (loss) or 0 (draw); no future value.
            rewards[action_index] = float(outcome)
            dones[action_index] = True
            continue

        # Game continues: bootstrap from the network's best score in the
        # resulting position.
        # NOTE(review): the max runs over all outputs, including cells that
        # are already occupied, and the position evaluated is one where the
        # opponent is to move - confirm this is intended.
        next_state = sim_board.getHash().copy()
        with torch.no_grad():
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            q_values = model(next_state_tensor).squeeze()
        next_q_values[action_index] = q_values.max().item()

    return rewards, next_q_values, dones


def play_game(model, epsilon=0.1):
    """Play one game of the model (symbol 1) against a uniformly random opponent (-1).

    The starting player is chosen at random. On every model turn, all
    available moves are simulated and recorded before the model actually
    moves epsilon-greedily via ``training_move``.

    Args:
        model: Network used both for bootstrapped values and move selection.
        epsilon: Exploration rate forwarded to ``training_move``.

    Returns:
        List of tuples ``(state, available_mask, rewards, next_q_values, dones)``,
        one per model turn; each element is an array of length 9.

    NOTE(review): rewards are taken from the position right after the model's
    own move, so a game lost to the opponent's reply never produces a -1
    reward in the recorded transitions - verify this is intended.
    """
    # Randomly decide who starts
    mover = np.random.choice([-1, 1])
    board = Board(1, -1, mover)

    transitions = []

    # winner() returns None only while the game is still running.
    while board.winner() is None:
        if board.playerSymbol == 1:  # Model's turn
            current_state = board.getHash().copy()
            available_actions = board.availableMovesHash().copy()
            rewards, next_q_values, dones = _evaluate_actions(model, board)

            # Store the transition with all possible actions analyzed
            transitions.append(
                (current_state, available_actions, rewards, next_q_values, dones)
            )

            # Actually make a move using epsilon-greedy
            board.updateState(training_move(model, board, epsilon))
        else:  # Random opponent's turn (player -1)
            actions = board.availablePositions()
            if len(actions) > 0:
                board.updateState(actions[np.random.choice(len(actions))])

    return transitions

In [351]:
# Create a freshly initialised network with the default sizes
# (9 inputs, 9 outputs) and print its layer structure.
model = qnetwork()
print(model)

qnetwork(
  (net): Sequential(
    (0): Linear(in_features=9, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=9, bias=True)
  )
)


In [356]:
training_data = []  # not used in this cell; kept in case other cells reference it
gamma = 0.9  # discount factor applied to the bootstrapped next-state value

# Loop sizes in one place so the progress messages cannot drift from the code.
N_ITERATIONS = 10
GAMES_PER_ITERATION = 1000
EPOCHS_PER_ITERATION = 10000
LOG_EVERY = 100

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
loss_fn = torch.nn.MSELoss()

# Outer loop: generate fresh games with the current model, then fit on them.
for iteration in range(N_ITERATIONS):
    print(f"\n{'='*50}")
    print(f"Iteration {iteration + 1}/{N_ITERATIONS}")
    print(f"{'='*50}")

    print(f"Generating {GAMES_PER_ITERATION} games...")
    all_games = []
    for game_idx in range(GAMES_PER_ITERATION):
        if (game_idx + 1) % LOG_EVERY == 0:
            print(f"  Generated {game_idx + 1}/{GAMES_PER_ITERATION} games")
        game = play_game(model)
        all_games.extend(game)  # Flatten all transitions into one list

    # Convert all data to tensors once
    current_state, available_actions, reward, next_q_values, done = zip(*all_games)

    current_state = np.array(current_state, dtype=np.float32)
    available_actions = np.array(available_actions, dtype=np.float32)
    reward = np.array(reward, dtype=np.float32)
    next_q_values = np.array(next_q_values, dtype=np.float32)
    done = np.array(done, dtype=np.float32)

    Xs = torch.from_numpy(current_state)
    available_mask = torch.from_numpy(available_actions)
    rewards = torch.from_numpy(reward)
    next_qs = torch.from_numpy(next_q_values)
    dones = torch.from_numpy(done)

    # TD target; it depends only on the recorded data, so compute it once
    # per iteration instead of once per epoch.
    qs_target = rewards + gamma * next_qs * (1 - dones)

    print(f"Training on {len(all_games)} transitions for {EPOCHS_PER_ITERATION} epochs...")

    for epoch in range(EPOCHS_PER_ITERATION):
        # Forward pass
        qs_predicted = model(Xs)

        # Loss over all 9 outputs. Entries for occupied cells were recorded as
        # reward 0 / next_q 0, so their target is 0.
        # NOTE(review): available_mask is built above but not applied here;
        # confirm whether the loss was meant to cover available actions only.
        loss_value = loss_fn(qs_predicted, qs_target)

        # Backward pass
        optimizer.zero_grad()
        loss_value.backward()
        optimizer.step()

        if (epoch + 1) % LOG_EVERY == 0:
            print(f"  Epoch {epoch + 1}/{EPOCHS_PER_ITERATION} - Loss: {loss_value.item():.6f}")

    print(f"Iteration {iteration + 1} complete. Final loss: {loss_value.item():.6f}")

print("\n" + "="*50)
print("Training complete!")
print("="*50)


Iteration 1/1000
Generating 100 games...
  Generated 100/1000 games
  Generated 200/1000 games
  Generated 300/1000 games
  Generated 400/1000 games
  Generated 500/1000 games
  Generated 600/1000 games
  Generated 700/1000 games
  Generated 800/1000 games
  Generated 900/1000 games
  Generated 1000/1000 games
Training on 3215 transitions for 100 epochs...
  Epoch 100/100 - Loss: 0.000472
  Epoch 200/100 - Loss: 0.000356
  Epoch 300/100 - Loss: 0.000291
  Epoch 400/100 - Loss: 0.000247
  Epoch 500/100 - Loss: 0.000213
  Epoch 600/100 - Loss: 0.000188
  Epoch 700/100 - Loss: 0.000168
  Epoch 800/100 - Loss: 0.000151
  Epoch 900/100 - Loss: 0.000136
  Epoch 1000/100 - Loss: 0.000123
  Epoch 1100/100 - Loss: 0.000113
  Epoch 1200/100 - Loss: 0.000103
  Epoch 1300/100 - Loss: 0.000094
  Epoch 1400/100 - Loss: 0.000087
  Epoch 1500/100 - Loss: 0.000080
  Epoch 1600/100 - Loss: 0.000074
  Epoch 1700/100 - Loss: 0.000069
  Epoch 1800/100 - Loss: 0.000064
  Epoch 1900/100 - Loss: 0.000060
  E

In [357]:
# Save the network's parameters (its state_dict) to "tictactoe.pth".
torch.save(model.state_dict(), "tictactoe.pth")

In [358]:
# Rebuild the network and restore the parameters saved in "tictactoe.pth".
# This rebinds the global `model`, which play_against_model() reads.
model = qnetwork()  # fresh instance with the default layer sizes
model.load_state_dict(torch.load("tictactoe.pth", weights_only=True))
model.eval()  # put the module in evaluation mode before playing

def _render_board(board):
    """Print the grid: 'O' for the model (1), 'X' for the human (-1), '.' for empty."""
    symbols = {1: 'O', -1: 'X'}
    print("\nCurrent board:")
    for i in range(BOARD_ROWS):
        row_display = [
            symbols.get(int(board.board[i, j]), '.') for j in range(BOARD_COLS)
        ]
        print(" | ".join(row_display))
        if i < BOARD_ROWS - 1:
            print("-" * 9)
    print()


def _greedy_legal_move(board):
    """Return the flat index of the empty cell with the highest model score."""
    state = torch.FloatTensor(board.getHash()).unsqueeze(0)
    with torch.no_grad():
        q_values = model(state).squeeze()

    # Occupied cells must never be chosen, otherwise the model would
    # overwrite an existing piece; push their scores to -inf before arg-max.
    legal = torch.from_numpy(board.availableMovesHash()).bool()
    q_values = q_values.masked_fill(~legal, float("-inf"))
    return int(torch.argmax(q_values).item())


def play_against_model():
    """Play one interactive game against the global ``model`` via input().

    The model plays symbol 1 ('O') and moves first; the human plays -1 ('X')
    and enters a position 0-8 in row-major order. The board is printed
    before every move and the result is announced when the game ends.
    """
    # Human is -1, Model is 1
    board = Board(1, -1, 1)  # Model starts (you can change this)

    print("You are X (-1), Model is O (1)")
    print("Positions are numbered 0-8 (row-major order):")
    print("0 | 1 | 2")
    print("3 | 4 | 5")
    print("6 | 7 | 8")
    print()

    while True:
        _render_board(board)

        # Check if game ended
        winner = board.winner()
        if board.isEnd:
            if winner == 1:
                print("Model wins!")
            elif winner == -1:
                print("You win!")
            else:
                print("It's a draw!")
            break

        if board.playerSymbol == 1:  # Model's turn
            print("Model is thinking...")
            action_index = _greedy_legal_move(board)
            print(f"Model plays position {action_index}")
            board.updateState(divmod(action_index, BOARD_COLS))

        else:  # Human's turn
            available = board.availablePositions()
            available_indices = [pos[0] * BOARD_COLS + pos[1] for pos in available]

            print(f"Available positions: {available_indices}")

            while True:
                try:
                    move = int(input("Enter your move (0-8): "))
                except ValueError:
                    # Only non-numeric input is retried; KeyboardInterrupt and
                    # EOFError propagate so the loop can be interrupted.
                    print("Invalid input! Enter a number 0-8.")
                    continue

                if move in available_indices:
                    board.updateState(divmod(move, BOARD_COLS))
                    break
                print("Invalid move! Position not available.")

# Start one interactive game; this blocks on input() until the game ends.
play_against_model()

You are X (-1), Model is O (1)
Positions are numbered 0-8 (row-major order):
0 | 1 | 2
3 | 4 | 5
6 | 7 | 8


Current board:
. | . | .
---------
. | . | .
---------
. | . | .

Model is thinking...
Model plays position 0

Current board:
O | . | .
---------
. | . | .
---------
. | . | .

Available positions: [1, 2, 3, 4, 5, 6, 7, 8]

Current board:
O | . | X
---------
. | . | .
---------
. | . | .

Model is thinking...
Model plays position 6

Current board:
O | . | X
---------
. | . | .
---------
O | . | .

Available positions: [1, 3, 4, 5, 7, 8]

Current board:
O | . | X
---------
X | . | .
---------
O | . | .

Model is thinking...
Model plays position 5

Current board:
O | . | X
---------
X | . | O
---------
O | . | .

Available positions: [1, 4, 7, 8]
Invalid input! Enter a number 0-8.

Current board:
O | X | X
---------
X | . | O
---------
O | . | .

Model is thinking...
Model plays position 8

Current board:
O | X | X
---------
X | . | O
---------
O | . | O

Available positions: [4, 