In [84]:
from tictactoe import TicTacToeEnv, QNTicTacToe
import numpy as np

INIT_RANDOM_GAMES = 500
BATCH = 64
EPOCHS = 10
GAMMA = 0.9

model = QNTicTacToe()
env = TicTacToeEnv()

states = []
actions = []
next_states = []
rewards = []
dones = []

losses = []

In [85]:
for _ in range(INIT_RANDOM_GAMES):
    env.reset()
    done = False

    while not done:
        s = env.board.copy()
        a = env.random_action()
        ns, r, done = env.step(a)

        states.append(s)
        actions.append(a)
        next_states.append(ns)
        rewards.append(r)
        dones.append(done)

In [86]:
import random
import torch
import numpy as np

def evaluate_model(model, n_games=100):
    """Play against random opponent and see win rate"""
    wins = 0
    losses = 0
    draws = 0
    
    for _ in range(n_games):
        env.reset()
        done = False
        
        while not done:
            s = env.board.copy()
            with torch.no_grad():
                q_vals = model(torch.tensor(s, dtype=torch.float32))
                # Mask illegal moves
                available_mask = (s == 0)
                q_vals[~available_mask] = -float('inf')
                a = int(torch.argmax(q_vals).item())
            
            _, r, done = env.step(a)
        
        if r == 1:
            wins += 1
        elif r == -1:
            losses += 1
        else:
            draws += 1
    
    return wins/n_games*100, losses/n_games*100, draws/n_games*100

def train_n_games(model, target_model, optimizer, n_games=20000+1, epsilon_start=1.0, epsilon_end=0.05, epsilon_decay=0.9995):
    
    epsilon = epsilon_start
    losses_all = []

    for game in range(n_games):

        # --- play one game ---
        env.reset()
        done = False

        while not done:
            s = env.board.copy()

            with torch.no_grad():
                q_vals = model(torch.tensor(s, dtype=torch.float32))

            # epsilon-greedy
            if random.random() < epsilon:
                a = int(np.random.choice(np.where(s == 0)[0]))
            else:
                a = int(torch.argmax(q_vals).item())

            ns, r, done = env.step(a)

            states.append(s)
            actions.append(a)
            next_states.append(ns)
            rewards.append(r)
            dones.append(done)

        # decay epsilon
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        # update target network every 1000 games
        if game % 500 == 0 and game > 0:
            target_model.load_state_dict(model.state_dict())

        # --- training step ---
        states_t      = torch.tensor(states,      dtype=torch.float32)
        actions_t     = torch.tensor(actions,     dtype=torch.int64)
        next_states_t = torch.tensor(next_states, dtype=torch.float32)
        rewards_t     = torch.tensor(rewards,     dtype=torch.float32)
        dones_t       = torch.tensor(dones,       dtype=torch.float32)

        steps = len(states_t) // BATCH // 10

        for _ in range(steps):
            idx = np.random.randint(0, len(states_t), size=BATCH)

            batch_s  = states_t[idx]
            batch_a  = actions_t[idx]
            batch_ns = next_states_t[idx]
            batch_r  = rewards_t[idx]
            batch_d  = dones_t[idx]

            # Q(s,a)
            q_vals = model(batch_s)
            q_sa = q_vals.gather(1, batch_a.unsqueeze(1)).squeeze(1)

            # target
            with torch.no_grad():
                next_q = target_model(batch_ns)
                max_next_q = torch.max(next_q, dim=1).values
                target = batch_r + GAMMA * max_next_q * (1 - batch_d)

            loss = torch.nn.functional.mse_loss(q_sa, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses_all.append(loss.item())

        # print progress
        if game % 1000 == 0:
            w, l, d = evaluate_model(model, n_games=1000)
            print(f"Game {game},  ε={epsilon:.3f}, loss={losses_all[-1]:.4f}, Win rate: {w:.2f}%, Loss: {l:.2f}%, Draw: {d:.2f}%")

    return losses_all

In [62]:
model = QNTicTacToe()
target_model = QNTicTacToe()

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

losses = train_n_games(model, target_model, optimizer)

Game 0,  ε=1.000, loss=1.1677, Win rate: 55.20%, Loss: 33.40%, Draw: 11.40%
Game 1000,  ε=0.606, loss=1.8312, Win rate: 79.20%, Loss: 12.60%, Draw: 8.20%
Game 2000,  ε=0.368, loss=1.0649, Win rate: 87.90%, Loss: 11.10%, Draw: 1.00%
Game 3000,  ε=0.223, loss=0.8014, Win rate: 91.40%, Loss: 7.20%, Draw: 1.40%
Game 4000,  ε=0.135, loss=0.3670, Win rate: 90.40%, Loss: 7.30%, Draw: 2.30%
Game 5000,  ε=0.082, loss=0.9235, Win rate: 90.00%, Loss: 7.90%, Draw: 2.10%
Game 6000,  ε=0.050, loss=0.2551, Win rate: 90.50%, Loss: 7.30%, Draw: 2.20%
Game 7000,  ε=0.050, loss=0.2411, Win rate: 92.00%, Loss: 5.30%, Draw: 2.70%
Game 8000,  ε=0.050, loss=0.1349, Win rate: 93.30%, Loss: 2.70%, Draw: 4.00%
Game 9000,  ε=0.050, loss=0.0909, Win rate: 92.00%, Loss: 2.90%, Draw: 5.10%
Game 10000,  ε=0.050, loss=0.0729, Win rate: 92.70%, Loss: 2.90%, Draw: 4.40%
Game 11000,  ε=0.050, loss=0.1217, Win rate: 92.10%, Loss: 3.00%, Draw: 4.90%
Game 12000,  ε=0.050, loss=0.1194, Win rate: 93.70%, Loss: 2.80%, Draw: 3

In [63]:
torch.save(model.state_dict(), "model_1.pth")
print("Training finished. Model saved to", "model_1.pth")

Training finished. Model saved to model_1.pth


## **Keep training on higher quality games**

In [105]:
from tictactoe import TicTacToeEnv, QNTicTacToe
import numpy as np
import torch

INIT_SMART_GAMES = 500
BATCH = 1000
EPOCHS = 10
GAMMA = 0.9

# Load the trained model
model_3 = QNTicTacToe()
model_3.load_state_dict(torch.load("model_4.pth"))
model_3.eval()

env = TicTacToeEnv()

states = []
actions = []
next_states = []
rewards = []
dones = []

def select_action_with_model(model, state, epsilon=0.1):
    """Select action using model with optional epsilon-greedy"""
    if np.random.random() < epsilon:
        # Random action
        return env.random_action()
    else:
        # Model action
        with torch.no_grad():
            q_vals = model(torch.tensor(state, dtype=torch.float32))
            available_mask = (state == 0)
            q_vals[~available_mask] = -float('inf')
            return int(torch.argmax(q_vals).item())

print("Generating games with model_3 playing against itself...")
for game_idx in range(INIT_SMART_GAMES):
    env.reset()
    done = False

    while not done:
        s = env.board.copy()
        
        # Model plays as current player (with small epsilon for exploration)
        a = select_action_with_model(model_3, s, epsilon=0.1)
        
        ns, r, done = env.step(a)

        states.append(s)
        actions.append(a)
        next_states.append(ns)
        rewards.append(r)
        dones.append(done)
    
    if (game_idx + 1) % 100 == 0:
        print(f"Generated {game_idx + 1}/{INIT_SMART_GAMES} games")

print(f"\nCollected {len(states)} transitions from {INIT_SMART_GAMES} self-play games")
print(f"Average transitions per game: {len(states)/INIT_SMART_GAMES:.1f}")

# Check quality of generated games
wins = sum(1 for r in rewards if r == 1)
losses = sum(1 for r in rewards if r == -1)
draws = sum(1 for i, r in enumerate(rewards) if r == 0 and dones[i])  # Fix: move enumerate to front
print(f"\nGame outcomes from Player 1 perspective:")
print(f"Wins: {wins}, Losses: {losses}, Draws: {draws}")

Generating games with model_3 playing against itself...
Generated 100/500 games
Generated 200/500 games
Generated 300/500 games
Generated 400/500 games
Generated 500/500 games

Collected 1678 transitions from 500 self-play games
Average transitions per game: 3.4

Game outcomes from Player 1 perspective:
Wins: 464, Losses: 19, Draws: 17


In [106]:
import random
import torch
import numpy as np

def evaluate_model(model, n_games=100):
    """Play against random opponent and see win rate"""
    wins = 0
    losses = 0
    draws = 0
    
    for _ in range(n_games):
        env.reset()
        done = False
        
        while not done:
            s = env.board.copy()
            with torch.no_grad():
                q_vals = model(torch.tensor(s, dtype=torch.float32))
                # Mask illegal moves
                available_mask = (s == 0)
                q_vals[~available_mask] = -float('inf')
                a = int(torch.argmax(q_vals).item())
            
            _, r, done = env.step(a)
        
        if r == 1:
            wins += 1
        elif r == -1:
            losses += 1
        else:
            draws += 1
    
    return wins/n_games*100, losses/n_games*100, draws/n_games*100

def train_n_games(model, target_model, optimizer, n_games=20000+1, epsilon_start=1.0, epsilon_end=0.05, epsilon_decay=0.9995):
    
    epsilon = epsilon_start
    losses_all = []

    for game in range(n_games):

        # --- play one game ---
        env.reset()
        done = False

        while not done:
            s = env.board.copy()

            with torch.no_grad():
                q_vals = model(torch.tensor(s, dtype=torch.float32))

            # epsilon-greedy
            if random.random() < epsilon:
                a = int(np.random.choice(np.where(s == 0)[0]))
            else:
                a = int(torch.argmax(q_vals).item())

            ns, r, done = env.step(a)

            states.append(s)
            actions.append(a)
            next_states.append(ns)
            rewards.append(r)
            dones.append(done)

        # decay epsilon
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        # update target network every 1000 games
        if game % 500 == 0 and game > 0:
            target_model.load_state_dict(model.state_dict())

        # --- training step ---
        states_t      = torch.tensor(states,      dtype=torch.float32)
        actions_t     = torch.tensor(actions,     dtype=torch.int64)
        next_states_t = torch.tensor(next_states, dtype=torch.float32)
        rewards_t     = torch.tensor(rewards,     dtype=torch.float32)
        dones_t       = torch.tensor(dones,       dtype=torch.float32)

        steps = len(states_t) // BATCH

        for _ in range(steps):
            idx = np.random.randint(0, len(states_t), size=BATCH)

            batch_s  = states_t[idx]
            batch_a  = actions_t[idx]
            batch_ns = next_states_t[idx]
            batch_r  = rewards_t[idx]
            batch_d  = dones_t[idx]

            # Q(s,a)
            q_vals = model(batch_s)
            q_sa = q_vals.gather(1, batch_a.unsqueeze(1)).squeeze(1)

            # target
            with torch.no_grad():
                next_q = target_model(batch_ns)
                max_next_q = torch.max(next_q, dim=1).values
                target = batch_r + GAMMA * max_next_q * (1 - batch_d)

            loss = torch.nn.functional.mse_loss(q_sa, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses_all.append(loss.item())

        # print progress
        if game % 1000 == 0:
            w, l, d = evaluate_model(model, n_games=1000)
            print(f"Game {game},  ε={epsilon:.3f}, loss={losses_all[-1]:.4f}, Win rate: {w:.2f}%, Loss: {l:.2f}%, Draw: {d:.2f}%")

    return losses_all

In [102]:
# Create the new optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-7)

In [108]:
# Load the saved model
model = QNTicTacToe()
model.load_state_dict(torch.load("model_4.pth"))
model.eval()  # Optional: set to eval mode first

# Initialize target model with the same weights
target_model = QNTicTacToe()
target_model.load_state_dict(model.state_dict())

# Set back to training mode
model.train()
target_model.eval()  # Target model stays in eval mode

# Continue training
losses = train_n_games(model, target_model, optimizer, n_games=5000+1, epsilon_start=1, epsilon_end=0.0, epsilon_decay=0.9)

# Save again with a new name
torch.save(model.state_dict(), "model_4.pth")

Game 0,  ε=0.900, loss=0.0448, Win rate: 94.90%, Loss: 1.50%, Draw: 3.60%
Game 1000,  ε=0.000, loss=0.0326, Win rate: 95.20%, Loss: 2.20%, Draw: 2.60%
Game 2000,  ε=0.000, loss=0.0303, Win rate: 95.40%, Loss: 1.60%, Draw: 3.00%
Game 3000,  ε=0.000, loss=0.0317, Win rate: 95.30%, Loss: 2.10%, Draw: 2.60%
Game 4000,  ε=0.000, loss=0.0371, Win rate: 95.20%, Loss: 2.60%, Draw: 2.20%
Game 5000,  ε=0.000, loss=0.0305, Win rate: 95.30%, Loss: 2.90%, Draw: 1.80%


## **Making two models play against each other**

In [123]:
import torch
import numpy as np
from tictactoe import TicTacToeEnv, QNTicTacToe

def evaluate_models(model1, model2, n_games=10000, verbose=True):
    """
    Evaluate two models playing against each other.
    
    Args:
        model1: First model (plays as player 1)
        model2: Second model (plays as player 2)
        n_games: Number of games to play
        epsilon1: Exploration rate for model1 (0 = greedy)
        epsilon2: Exploration rate for model2 (0 = greedy)
        verbose: Whether to print results
    
    Returns:
        Dictionary with statistics
    """
    env = TicTacToeEnv()
    
    model1.eval()
    model2.eval()
    
    model1_wins = 0
    model2_wins = 0
    draws = 0
    
    def select_action(model, state):
            with torch.no_grad():
                q_vals = model(torch.tensor(state, dtype=torch.float32))
                return int(torch.argmax(q_vals).item())
    
    for _ in range(n_games):
        env.reset()
        done = False
        current_player = np.random.choice([-1,1])
        
        while not done:
            s = env.board.copy()
            
            # Model 1 plays as player 1, Model 2 plays as player 2
            if current_player == 1:
                a = select_action(model1, s)
            else:
                a = select_action(model2, -s)
            
            _, r, done = env.step(a)
            
            # Switch player for next turn
            current_player = -current_player
        
        # Count results from player 1's perspective (model1)
        if r == 1:
            model1_wins += 1
        elif r == -1:
            model2_wins += 1
        else:
            draws += 1
    
    # Calculate percentages
    stats = {
        'model1_wins': model1_wins,
        'model2_wins': model2_wins,
        'draws': draws,
        'model1_win_rate': model1_wins / n_games * 100,
        'model2_win_rate': model2_wins / n_games * 100,
        'draw_rate': draws / n_games * 100,
        'total_games': n_games
    }
    
    if verbose:
        print(f"\n{'='*60}")
        print(f"Model 1 vs Model 2 - {n_games} games")
        print(f"{'='*60}")
        print(f"Model 1 wins: {model1_wins:4d} ({stats['model1_win_rate']:5.2f}%)")
        print(f"Model 2 wins: {model2_wins:4d} ({stats['model2_win_rate']:5.2f}%)")
        print(f"Draws:        {draws:4d} ({stats['draw_rate']:5.2f}%)")
        print(f"{'='*60}\n")
    
    return stats

def compare_all_models(model_files, n_games=500):
    """Compare all models against each other"""
    models = []
    names = []
    
    # Load all models
    for file in model_files:
        model = QNTicTacToe()
        model.load_state_dict(torch.load(file))
        models.append(model)
        names.append(file)
    
    print(f"\nComparing {len(models)} models...\n")
    
    # Create results matrix
    results = {}
    
    for i, (model1, name1) in enumerate(zip(models, names)):
        for j, (model2, name2) in enumerate(zip(models, names)):
            if i < j:  # Only compare each pair once
                print(f"\n{name1} vs {name2}")
                stats = evaluate_models(model1, model2, n_games=n_games, verbose=True)
                results[f"{name1}_vs_{name2}"] = stats
    
    return results


# Usage:
compare_all_models(["model_4.pth", "model_1.pth"], n_games=10000)


Comparing 2 models...


model_4.pth vs model_1.pth

Model 1 vs Model 2 - 10000 games
Model 1 wins: 8826 (88.26%)
Model 2 wins:  622 ( 6.22%)
Draws:         552 ( 5.52%)



{'model_4.pth_vs_model_1.pth': {'model1_wins': 8826,
  'model2_wins': 622,
  'draws': 552,
  'model1_win_rate': 88.26,
  'model2_win_rate': 6.22,
  'draw_rate': 5.52,
  'total_games': 10000}}