In [19]:
# First remove conflicting packages
!pip uninstall -y dopamine-rl gym

# Then install specific compatible versions
!pip install numpy==1.23.5 torch==2.0.1 matplotlib==3.7.1 tqdm==4.65.0 gym==0.25.2

# For visualization extensions
!pip install pygame==2.3.0

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Successfully uninstalled gym-0.25.2
Collecting gym==0.25.2
  Using cached gym-0.25.2-py3-none-any.whl
Installing collected packages: gym
Successfully installed gym-0.25.2


From the UTTT implementation, we can borrow the dual-head architecture and improved convolutional layers:

In [20]:
# Core Game Implementation
import numpy as np
import random
import matplotlib


# Neural Network/RL Components
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Visualization
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Progress Tracking
from tqdm import tqdm

# Game Environment Utilities
from copy import deepcopy
from collections import deque

In [21]:
# Sanity check: report the installed library versions and CUDA availability
def verify_environment():
    """Print the PyTorch, NumPy and Matplotlib versions plus CUDA availability.

    Writes four lines to stdout and returns ``None``.
    """
    report_lines = [
        f"PyTorch version: {torch.__version__}",
        f"CUDA available: {torch.cuda.is_available()}",
        f"NumPy version: {np.__version__}",
        f"Matplotlib version: {matplotlib.__version__}",
    ]
    for report_line in report_lines:
        print(report_line)

verify_environment()

PyTorch version: 2.0.1+cu117
CUDA available: False
NumPy version: 1.23.5
Matplotlib version: 3.7.1


In [22]:
class BoopNet(nn.Module):
    """Convolutional network with a shared trunk and separate policy/value heads.

    Args:
        grid_size: Side length of the square board the network is sized for.
            The policy head emits ``grid_size ** 2`` logits.

    The input is expected to carry 4 channels (first ``Conv2d`` has
    ``in_channels=4``) and spatial dimensions ``grid_size x grid_size``.
    """

    def __init__(self, grid_size=5):
        super().__init__()
        cells = grid_size * grid_size
        width = 128

        # Shared feature extractor: two 3x3 conv stages that keep the spatial size.
        trunk = [
            nn.Conv2d(4, width, 3, padding=1),
            nn.BatchNorm2d(width),
            nn.ReLU(),
            nn.Conv2d(width, width, 3, padding=1),
            nn.BatchNorm2d(width),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*trunk)

        # Policy head: 1x1 conv down to 2 planes, then one logit per board cell.
        policy_layers = [
            nn.Conv2d(width, 2, 1),
            nn.BatchNorm2d(2),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(2 * cells, cells),
        ]
        self.policy_head = nn.Sequential(*policy_layers)

        # Value head: 1x1 conv down to 1 plane, MLP, tanh-squashed scalar in [-1, 1].
        value_layers = [
            nn.Conv2d(width, 1, 1),
            nn.BatchNorm2d(1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(cells, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Tanh(),
        ]
        self.value_head = nn.Sequential(*value_layers)

    def forward(self, x):
        """Return ``(policy_logits, value)`` for a batch of board tensors.

        ``policy_logits`` has shape ``(batch, grid_size ** 2)`` and ``value``
        has shape ``(batch, 1)``.
        """
        features = self.conv(x)
        policy_logits = self.policy_head(features)
        value = self.value_head(features)
        return policy_logits, value

In [23]:
class Node:
    """One position in the search tree built by :class:`MCTS`.

    Args:
        game_state: Object exposing ``get_valid_moves()``; queried once at
            construction to seed ``untried_actions``.
        parent: The node this one was expanded from (``None`` for the root).
        action: The move that leads from ``parent`` to this node
            (``None`` for the root).
    """

    def __init__(self, game_state, parent=None, action=None):
        self.game_state = game_state
        self.parent = parent
        self.action = action  # read by MCTS when replaying / returning moves
        self.children = []
        self.wins = 0    # accumulated value, from the view of the player who moved into this node
        self.visits = 0  # number of simulations that passed through this node
        # Copy the list so expanding the node never mutates a list owned by the state.
        self.untried_actions = list(game_state.get_valid_moves())

    def add_child(self, game_state, action):
        """Create, attach and return the child reached by playing ``action``.

        ``action`` is removed from ``untried_actions`` so it is not expanded twice.
        """
        if action in self.untried_actions:
            self.untried_actions.remove(action)
        child = Node(game_state, parent=self, action=action)
        self.children.append(child)
        return child


class MCTS:
    """Monte Carlo tree search that scores leaves with the agent's value network.

    Args:
        agent: Object exposing ``model``, a ``torch.nn.Module`` whose forward
            pass returns ``(policy, value)``.
        simulations: Number of select/expand/evaluate/back-up iterations per search.
        exploration: UCB1 exploration constant used by :meth:`select_child`.
    """

    def __init__(self, agent, simulations=200, exploration=1.4):
        self.agent = agent
        self.simulations = simulations
        self.exploration = exploration

    def select_child(self, node):
        """Return the child of ``node`` with the highest UCB1 score."""
        # max(..., 1) keeps log() finite if the parent has not been backed up yet.
        log_parent = np.log(max(node.visits, 1))

        def ucb1(child):
            if child.visits == 0:
                return float("inf")  # always try unvisited children first
            mean_value = child.wins / child.visits
            return mean_value + self.exploration * np.sqrt(log_parent / child.visits)

        return max(node.children, key=ucb1)

    def search(self, initial_state):
        """Run ``self.simulations`` simulations and return the most-visited root action.

        ``initial_state`` must expose ``copy()``, ``step(action)``, ``board`` and
        ``get_valid_moves()``. It is never mutated: every simulation works on a
        fresh copy.

        Raises:
            ValueError: if the root ends up with no children (no legal moves,
                or ``simulations`` is 0).
        """
        root = Node(initial_state)

        model = self.agent.model
        was_training = model.training
        # Inference must not use batch statistics or update BatchNorm running stats.
        model.eval()
        try:
            for _ in range(self.simulations):
                self._simulate(root, initial_state.copy())
        finally:
            model.train(was_training)

        if not root.children:
            raise ValueError("MCTS.search: root has no children (no legal moves or zero simulations)")
        return max(root.children, key=lambda c: c.visits).action

    def _simulate(self, root, state):
        """Run one selection / expansion / evaluation / back-up pass on ``state``."""
        node = root

        # Selection: descend while the node is fully expanded and has children.
        while not node.untried_actions and node.children:
            node = self.select_child(node)
            state.step(node.action)

        # Expansion: try one not-yet-explored action.
        if node.untried_actions:
            action = random.choice(node.untried_actions)
            state.step(action)
            node = node.add_child(state, action)

        # Evaluation: the network replaces a random rollout.
        # NOTE(review): terminal positions are also scored by the network here.
        leaf_value = self._evaluate(state)

        # Back-up. Assumes the value head scores the position for the player
        # about to move and that players alternate turns: the player who moved
        # INTO the leaf therefore gets the negated value, and the sign flips at
        # every ply on the way up.
        signed_value = -leaf_value
        while node is not None:
            node.visits += 1
            node.wins += signed_value
            signed_value = -signed_value
            node = node.parent

    def _evaluate(self, state):
        """Return the value network's scalar estimate for ``state.board``."""
        board = torch.as_tensor(np.asarray(state.board), dtype=torch.float32)
        # Conv2d wants (batch, channels, H, W): a (C, H, W) board gains only the
        # batch axis, a plain (H, W) board gains batch and channel axes.
        while board.dim() < 4:
            board = board.unsqueeze(0)
        with torch.no_grad():
            _policy, value = self.agent.model(board)
        return value.item()

In [24]:
class AlphaBoopAgent:
    """Self-play agent combining a BoopNet, an MCTS searcher and a replay memory.

    Args:
        grid_size: Board side length; used to size the network, to create
            self-play games and to flatten actions into policy indices.
    """

    def __init__(self, grid_size=5):
        self.grid_size = grid_size
        self.model = BoopNet(grid_size)
        # Adam at its default learning rate, with L2 weight decay.
        self.optimizer = optim.Adam(self.model.parameters(), weight_decay=1e-4)
        self.mcts = MCTS(self)
        # Replay memory of (state, action, reward); oldest entries are dropped past 10000.
        self.memory = deque(maxlen=10000)

    def self_play(self, num_games=100):
        """Play ``num_games`` games against itself and store the shaped transitions."""
        for _ in range(num_games):
            # The board must match the size the network was built for, otherwise
            # the heads' Linear layers reject the flattened features.
            game = BoopGame(size=self.grid_size)
            memory_episode = []

            while not game.game_over:
                state = game.get_state()
                action = self.mcts.search(state)
                memory_episode.append((
                    state.copy(),
                    action,
                    game.current_player
                ))
                game.step(action)

            self.process_episode(memory_episode, game.winner)

    def process_episode(self, memory, winner):
        """Turn one finished game into ``(state, action, reward)`` memory entries.

        Args:
            memory: Iterable of ``(state, action, player)`` tuples, one per move.
            winner: The winning player id as reported by the game.
        """
        for state, action, player in memory:
            # +10 for the eventual winner's moves, -5 for everyone else
            # (this includes every move of a game that has no winner).
            reward = 10 if player == winner else -5

            # Additional shaping rewards
            if self.check_graduation(state):
                reward += 3
            if self.check_cat_placement(state, action):
                reward += 1

            self.memory.append((state, action, reward))

    def check_graduation(self, state):
        """Reward-shaping hook: did ``state`` contain a graduation?

        Placeholder — no state-level graduation check exists in this notebook,
        so this grants no bonus. TODO: implement or override.
        """
        return False

    def check_cat_placement(self, state, action):
        """Reward-shaping hook: did ``action`` place a cat?

        Placeholder — grants no bonus. TODO: implement or override.
        """
        return False

    def _action_to_index(self, action):
        """Flatten an action whose first two items are ``(row, col)`` to a policy index.

        Row-major: ``row * grid_size + col``, matching the ``grid_size ** 2``
        logits of the policy head. Any further items (e.g. an ``is_cat`` flag)
        are ignored.
        """
        row, col = action[0], action[1]
        return int(row) * self.grid_size + int(col)

    def train(self, batch_size=512, epochs=10):
        """Fit the network on one random batch from memory for ``epochs`` steps.

        The batch is capped at the number of stored transitions; with an empty
        memory the call is a no-op.
        """
        if not self.memory:
            return
        batch_size = min(batch_size, len(self.memory))
        states, actions, rewards = zip(*random.sample(list(self.memory), batch_size))

        # Stored states may be raw board arrays or objects carrying a `.board`
        # (the same attribute MCTS reads); accept both.
        boards = np.array([np.asarray(getattr(s, "board", s)) for s in states])
        states = torch.as_tensor(boards, dtype=torch.float32)
        if states.dim() == 3:
            # Single-plane (H, W) boards: add the channel axis. Multi-plane
            # (C, H, W) boards already stack to (batch, C, H, W).
            states = states.unsqueeze(1)
        actions = torch.LongTensor([self._action_to_index(a) for a in actions])
        rewards = torch.FloatTensor(rewards)

        # NOTE(review): rewards range well outside [-1, 1] while the value head
        # ends in tanh, so the value loss cannot reach zero — consider rescaling.
        self.model.train()
        for _ in range(epochs):
            policy_pred, value_pred = self.model(states)

            policy_loss = F.cross_entropy(policy_pred, actions)
            # reshape(-1) keeps a 1-D tensor even when the batch holds one sample.
            value_loss = F.mse_loss(value_pred.reshape(-1), rewards)
            loss = policy_loss + value_loss

            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            self.optimizer.step()

In [25]:
def augment_state(state):
    """Return the 8 rotations/reflections of a board.

    Args:
        state: Array whose LAST two axes are the board rows and columns, e.g.
            a plain ``(H, W)`` grid or a ``(planes, H, W)`` stack.

    Returns:
        A list of 8 arrays: for each quarter-turn ``k`` in 0..3 the rotated
        board followed by its left-right mirror image.
    """
    augmented = []
    for k in range(4):
        # Rotate in the board plane only; the default axes (0, 1) would mix the
        # plane axis of a (planes, H, W) board with its rows.
        rotated = np.rot90(state, k, axes=(-2, -1))
        augmented.append(rotated)
        # Mirror the column axis (equals np.fliplr for 2-D input).
        augmented.append(np.flip(rotated, axis=-1))
    return augmented

In [26]:
def train_progressive(agent):
    """Run a three-phase self-play / training curriculum on ``agent``.

    Each phase plays a number of self-play games and then calls
    ``agent.train`` once; later phases first raise the MCTS simulation budget.
    """
    # (MCTS simulations to switch to, or None to keep the current budget,
    #  self-play games, training batch size)
    curriculum = (
        (None, 1000, 256),     # Phase 1: basic positioning
        (500, 5000, 1024),     # Phase 2: strategic patterns
        (1000, 10000, 2048),   # Phase 3: expert refinement
    )
    for simulations, games, batch in curriculum:
        if simulations is not None:
            agent.mcts.simulations = simulations
        agent.self_play(num_games=games)
        agent.train(batch_size=batch)

In [27]:
class Evaluator:
    """Measures an agent's win rate against the built-in baseline opponents.

    Args:
        agent: Object whose ``mcts.search(state)`` picks the agent's move.
    """

    def __init__(self, agent):
        self.agent = agent
        baselines = {}
        baselines['random'] = RandomAgent()
        baselines['greedy'] = GreedyAgent()
        self.base_agents = baselines

    def benchmark(self, num_games=100):
        """Play ``num_games`` games per baseline; return ``{name: win_rate}``.

        The evaluated agent always moves as player 1; a game counts as a win
        only when the game reports winner 1.
        """
        win_rates = {}
        for label, rival in self.base_agents.items():
            victories = sum(
                1 for _ in range(num_games) if self._play_game(rival) == 1
            )
            win_rates[label] = victories / num_games
        return win_rates

    def _play_game(self, opponent):
        """Play one full game against ``opponent`` and return the game's winner."""
        game = BoopGame()
        while not game.game_over:
            if game.current_player == 1:
                move = self.agent.mcts.search(game.get_state())
            else:
                move = opponent.act(game)
            game.step(move)
        return game.winner

In [28]:
class BoopGame:
    """Board and piece-pool state for a two-player Boop game.

    Args:
        size: Side length of the square board.
    """

    def __init__(self, size=8):
        self.size = size
        # 4-plane board: [0] = player 1 kittens, [1] = player 1 cats,
        #                [2] = player 2 kittens, [3] = player 2 cats
        self.board = np.zeros((4, size, size), dtype=int)
        # Pieces each player still holds off the board.
        self.pool = {
            1: {"kittens": 8, "cats": 0},
            2: {"kittens": 8, "cats": 0}
        }
        self.current_player = 1
        self.winner = None
        self.game_over = False

    def get_valid_moves(self, is_cat=False):
        """Return the empty cells where the current player may place a piece.

        Args:
            is_cat: ``True`` to ask about placing a cat, ``False`` for a kitten.

        Returns:
            A list of ``[row, col]`` pairs in row-major order; empty when the
            current player's pool holds no piece of the requested kind.
        """
        # A piece can only be placed if the pool still holds one of that kind.
        piece_kind = "cats" if is_cat else "kittens"
        if self.pool[self.current_player][piece_kind] < 1:
            return []
        # A cell is free when no plane has a piece on it.
        occupancy = np.sum(self.board, axis=0)
        return np.argwhere(occupancy == 0).tolist()

In [29]:
def apply_boop(self, row, col, is_cat):
    """Push ("boop") every eligible piece adjacent to a newly placed piece.

    Args:
        self: Game object exposing ``board``, ``size``, ``_move_piece`` and
            ``_return_to_pool``.
        row, col: Cell where the new piece was placed.
        is_cat: Whether the placed piece is a cat. Cats push any piece;
            kittens only push kittens (the even-numbered board planes).

    Each pushed piece moves one cell directly away from ``(row, col)``; a piece
    pushed past the edge is handed to ``_return_to_pool`` instead.
    """
    def on_board(r, c):
        return 0 <= r < self.size and 0 <= c < self.size

    # The eight neighbour offsets, row by row, skipping the centre cell.
    offsets = [
        (d_row, d_col)
        for d_row in (-1, 0, 1)
        for d_col in (-1, 0, 1)
        if (d_row, d_col) != (0, 0)
    ]

    for d_row, d_col in offsets:
        near_r, near_c = row + d_row, col + d_col
        if not on_board(near_r, near_c):
            continue

        for layer in range(4):
            if self.board[layer, near_r, near_c] != 1:
                continue
            # Odd planes hold cats, which a kitten cannot push.
            if not is_cat and layer % 2 != 0:
                continue

            far_r, far_c = near_r + d_row, near_c + d_col
            if on_board(far_r, far_c):
                self._move_piece(layer, near_r, near_c, far_r, far_c)
            else:
                self._return_to_pool(layer, near_r, near_c)

def _move_piece(self, layer, old_r, old_c, new_r, new_c):
    # Handle collisions and chain reactions
    if np.sum(self.board[:, new_r, new_c]) == 0:
        self.board[layer, new_r, new_c] = 1
        self.board[layer, old_r, old_c] = 0

In [30]:
def check_graduation(self):
    """Graduate the first player found with a line of kittens.

    Args:
        self: Game object exposing ``board``, ``pool``, ``find_lines`` and
            ``remove_kittens``.

    Players are checked in order 1, 2. For the first one whose kitten plane
    yields a truthy ``find_lines`` result, 3 cats are added to that player's
    pool and the lines are passed to ``remove_kittens``.

    Returns:
        ``True`` if a graduation happened, ``False`` otherwise.
    """
    for player in [1, 2]:
        kitten_layer = 0 if player == 1 else 2
        mask = self.board[kitten_layer]
        lines = self.find_lines(mask)

        if lines:
            # Graduated kittens are replaced by cats: only the cat pool grows.
            # Crediting kittens as well would hand the player extra pieces.
            self.pool[player]["cats"] += 3
            self.remove_kittens(lines)
            return True
    return False

def check_win(self):
    """Return the winning player id, or ``None`` if nobody has won yet.

    Args:
        self: Game object exposing ``board`` and ``find_lines``.

    A player wins when ``find_lines`` reports a length-3 line on that player's
    cat plane, or when that plane holds exactly 8 cats. Player 1 is checked
    before player 2.
    """
    cat_plane_of = {1: 1, 2: 3}
    for player, plane in cat_plane_of.items():
        has_line = self.find_lines(self.board[plane], length=3)
        if has_line or np.sum(self.board[plane]) == 8:
            return player
    return None

In [31]:
class RandomAgent:
    """Baseline opponent that picks uniformly among the game's valid moves."""

    def act(self, game):
        """Return one entry of ``game.get_valid_moves()`` chosen at random."""
        candidates = game.get_valid_moves()
        return random.choice(candidates)

class GreedyAgent:
    """Baseline opponent that plays the valid move with the highest heuristic score."""

    def act(self, game):
        """Return the best-scoring valid move; ties go to the earliest move."""
        def score(move):
            return self.evaluate_move(game, move)

        return max(game.get_valid_moves(), key=score)

    def evaluate_move(self, game, move):
        """Score ``move`` for ``game``; placeholder that rates every move 0."""
        return 0  # Placeholder

In [32]:
# Pick the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
# NOTE(review): no code visible in this notebook moves a model or tensor to
# `device` — confirm it is actually used, or pass it to the model/tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class BoopNet(nn.Module):
    """Convolutional policy/value network for a ``grid_size x grid_size`` board.

    Args:
        grid_size: Side length of the square board. Defaults to 8, which
            reproduces the previously hard-coded 8x8 layer sizes; accepting
            the argument lets callers build the net with ``BoopNet(grid_size)``.

    The input must have 4 channels and spatial size ``grid_size x grid_size``.
    """

    def __init__(self, grid_size=8):
        super().__init__()
        cells = grid_size * grid_size
        # Shared trunk: two 3x3 conv stages that preserve the spatial size.
        self.conv = nn.Sequential(
            nn.Conv2d(4, 128, 3, padding=1),  # 4 input channels
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        # Policy head: one logit per board cell.
        self.policy_head = nn.Sequential(
            nn.Conv2d(128, 2, 1),
            nn.Flatten(),
            nn.Linear(2 * cells, cells)
        )
        # Value head: a single tanh-squashed scalar in [-1, 1].
        self.value_head = nn.Sequential(
            nn.Conv2d(128, 1, 1),
            nn.Flatten(),
            nn.Linear(cells, 1),
            nn.Tanh()
        )

    def forward(self, x):
        """Return ``(policy_logits, value)`` with shapes ``(B, grid_size**2)`` and ``(B, 1)``."""
        x = self.conv(x)
        policy = self.policy_head(x)
        value = self.value_head(x)
        return policy, value

Using device: cpu


In [33]:
def test_boop_mechanics():
    """Placing a kitten next to another kitten must push the neighbour away.

    NOTE(review): assumes ``game.step((row, col, is_cat))`` places the piece on
    the current player's plane and then applies ``apply_boop`` — confirm
    against the ``step`` implementation.
    """
    game = BoopGame(size=3)
    # Player 1 places a kitten in the centre.
    game.step((1, 1, False))  # Kitten placement
    assert game.board[0, 1, 1] == 1
    # Player 2 places a kitten diagonally adjacent, at the corner.
    game.current_player = 2
    game.step((0, 0, False))
    # The newly placed piece stays put; it is the neighbour that gets booped,
    # one cell directly away from the new piece: (1, 1) -> (2, 2).
    assert game.board[2, 0, 0] == 1
    assert game.board[0, 1, 1] == 0
    assert game.board[0, 2, 2] == 1

In [34]:
def plot_board(game):
    """Show the board as a single image.

    Player 1 pieces are positive and player 2 pieces negative; cats are drawn
    with twice the magnitude of kittens.
    """
    planes = game.board
    kitten_balance = planes[0] - planes[2]
    cat_balance = planes[1] - planes[3]

    _fig, ax = plt.subplots()
    ax.matshow(kitten_balance + 2*cat_balance)
    plt.show()

In [35]:
def train():
    """Run three self-play phases on the module-level ``agent``.

    NOTE(review): ``agent`` is read as a global and is not created anywhere in
    the visible notebook. The ``AlphaBoopAgent.self_play`` shown in this
    notebook accepts only ``num_games``, so the ``use_cats`` keyword of phase 1
    would raise ``TypeError`` against it. No phase calls ``agent.train`` —
    confirm that is intended.
    """
    # (MCTS simulation budget to switch to, or None to keep it; self_play kwargs)
    phases = (
        (None, {"num_games": 1000, "use_cats": False}),  # Phase 1: basic placement (no cats)
        (None, {"num_games": 5000}),                     # Phase 2: introduce graduation
        (1000, {"num_games": 10000}),                    # Phase 3: full rules
    )
    for simulations, play_kwargs in phases:
        if simulations is not None:
            agent.mcts.simulations = simulations
        agent.self_play(**play_kwargs)

In [36]:
# Training hyper-parameters gathered in one place.
# NOTE(review): nothing visible in this notebook reads this dict. The agent's
# optimizer is built without an explicit learning rate and its replay memory
# uses maxlen=10000, so 'learning_rate' and 'memory_size' are not applied as
# written; 'gamma' and 'tau' are not referenced by any visible code. Confirm
# where (or whether) these values are meant to be consumed.
training_config = {
    'episodes': 10000,
    'batch_size': 512,
    'learning_rate': 0.001,
    'gamma': 0.99,
    'tau': 0.005,  # For soft updates
    'memory_size': 100000
}