In [1]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from collections import namedtuple, deque
import itertools
import random
import math

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [3]:
N = 8

## Setup
1. Setting up the environment class
2. Setting up the model class
3. Defining an optimizer function
4. Training the model
5. Testing the model


I use `row major` order.

## Environment

In [4]:
class NQueensEnv:
    def __init__(self, n):
        self.board_size = n
        self.reset()

    def reset(self):
        self.board = torch.zeros((self.board_size, self.board_size), dtype=torch.float32)
        return self.board.clone().flatten()
    
    def set_random_partial(self, k):
        has_set = False
        while not has_set:
            self.reset()
            avail_squares = list(itertools.product(range(self.board_size), range(self.board_size)))
            used = []
            rows, cols, diags, anti = set(), set(), set(), set()
            num_set = 0
            while num_set < k and avail_squares:
                row,col = random.sample(avail_squares)
                rows.add(row)
                cols.add(col)
                diags.add(col-row)
                anti.add(col+row)
                
                board[row][col] = 1
                used.append((row,col))
                
                avail_squares = [(i,j) 
                                 for (i,j) in itertools.product(range(self.board_size), range(self.board_size))
                                 if (not i in row and not j in col and not j-i in diags and not j+i in anti)]
                
            has_set = num_set == k
            
    def step(self, action):
        # The logic to place a queen and return the new state, reward, and done status
        row, col = action//self.board_size, action%self.board_size

        self.board[row][col] = 1
        done = self._check_done()
        reward = 1 if done else 0  # Only reward completing the puzzle
        return self.board.clone().flatten(), reward, done
        
    def _check_done(self):
        return sum(sum(row) for row in self.board) == self.board_size
        
    def get_random_action(self):
        valid_actions_mask = self.valid_actions_mask()
        valid_actions_mask.tolist()
        print(valid_actions_mask)
        avail_squares = [(i * self.board_size + j) 
                         for (i,j) in itertools.product(range(self.board_size), range(self.board_size))
                         if valid_actions_mask[self.board_size * i + j] # Row major
                        ]
        return random.sample(avail_squares, 1) if avail_squares else None
    
    def valid_actions_mask(self) -> torch.Tensor:
        rows,cols,diags,anti_diags = set(), set(), set(), set()
        valid_actions_mask = [1] * self.board_size**2
        for row in range(self.board_size):
            for col in range(self.board_size):
                if self.board[row][col] == 1:
                    rows.add(row)
                    cols.add(col)
                    diags.add(col-row)
                    anti_diags.add(col+row)
                    
        for row in range(self.board_size):
            for col in range(self.board_size):
                valid_actions_mask[self.board_size*row + col] = 1 - int(row in rows or col in cols or col-row in diags or col+row in anti_diags)
        
        return torch.as_tensor(valid_actions_mask)
        
    def render(self):
        # TODO: Implement a method to visualize the board state
        pass

In [5]:
env = NQueensEnv(8)
env.get_random_action(), env.reset()

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


([57],
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

## Model

In [6]:
# Model
class DQN(nn.Module):
    def __init__(self, board_size):
        super(DQN, self).__init__()
        self.board_size = board_size
        self.fc1 = nn.Linear(board_size * board_size, 512)
        self.fc2 = nn.Linear(512, 512)
        self.out = nn.Linear(512, board_size * board_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)

In [7]:
policy_net = DQN(N).to(device)
target_net = DQN(N).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
target_net

DQN(
  (fc1): Linear(in_features=64, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=64, bias=True)
)

In [8]:
# Replay memory to avoid temporal correlation
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

### Training - helpers

In [9]:
env = NQueensEnv(N)
env

<__main__.NQueensEnv at 0x2ddc1a48f70>

In [10]:
BATCH_SIZE = 128
GAMMA = 0.999 # discount val
EPS_START = 0.9 # Initial threshold
EPS_END = 0.05 # Final threshold
EPS_DECAY = 200 # higher EPS_DECAY => slower decay
# TARGET_UPDATE = 10

optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0

def select_action(state):
    global steps_done  # Ensure steps_done is accessible and can be modified
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Get the output from the policy network
            action_values = policy_net(state)
            print(action_values, action_values.shape, "###")
            # Generate a mask for valid actions
            valid_actions_mask = env.valid_actions_mask()  # Assume this method exists and returns a tensor of 0s and 1s
            # Apply the mask to zero out invalid actions
            masked_action_values = action_values * valid_actions_mask
            # Choose the action with the highest value that is also valid
            action = masked_action_values.max(0)[1].view(1, 1)
            return action
    else:
        # Exploratory action path        
        action = env.get_random_action()
        if action is None:
            return None
        return torch.tensor([action], device=device, dtype=torch.long)

In [11]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return  # Not enough samples in memory to perform optimization

    transitions = memory.sample(BATCH_SIZE)
    # Convert batch-array of Transitions to Transition of batch-arrays
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                        if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

### Training

In [12]:
N = 8

In [14]:
# Parameters
num_episodes = 1000  # Number of episodes to train on
TARGET_UPDATE = 10  # How often to update the target network

# Initialize the environment and the DQN model
env = NQueensEnv(N)

optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0  # Counter for the decaying exploration rate

for i_episode in range(num_episodes):
    # Initialize the state
    state = env.reset()
    state = torch.tensor(state, device=device, dtype=torch.float32)  # Reshape state to match DQN input
    
    for t in itertools.count():  # count() is an alternative to while True with an implicit break
        # Select and perform an action
        action = select_action(state)
        if action is None:
            next_state = state
            done = True
            reward = -1
        else:
            next_state, reward, done = env.step(action.item())  # Assuming env.step() is correctly implemented
        
        reward = torch.tensor([reward], device=device)

        print("#", next_state)
        # Observe new state
        if not done:
            next_state = torch.tensor(next_state, device=device, dtype=torch.float32)
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        
        if done:
            break

    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')


  state = torch.tensor(state, device=device, dtype=torch.float32)  # Reshape state to match DQN input
  next_state = torch.tensor(next_state, device=device, dtype=torch.float32)


tensor([-0.0071, -0.0335,  0.0541, -0.0319,  0.0193,  0.0220, -0.0362, -0.0150,
        -0.0092, -0.0008,  0.0068,  0.0130, -0.0104,  0.0329,  0.0448,  0.0370,
        -0.0242,  0.0526,  0.0241,  0.0027, -0.0070,  0.0398,  0.0228,  0.0075,
         0.0001, -0.0018, -0.0277, -0.0016,  0.0406,  0.0348, -0.0167,  0.0207,
         0.0182, -0.0417, -0.0090,  0.0034,  0.0020,  0.0118, -0.0105,  0.0122,
        -0.0480,  0.0128,  0.0220, -0.0309,  0.0096,  0.0155,  0.0349,  0.0059,
         0.0026, -0.0141,  0.0123,  0.0368,  0.0018, -0.0095,  0.0566, -0.0355,
         0.0176,  0.0043,  0.0116,  0.0168, -0.0295, -0.0441, -0.0411,  0.0035]) torch.Size([64]) ###
# tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 

# tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 1., 0., 0., 0., 0.])
tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 1., 0., 0., 0., 0.])
tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

TypeError: expected Tensor as element 4 in argument 0, but got NoneType