Based on

- https://github.com/pytorch/examples/blob/87d9a1e930b5b813/reinforcement_learning/reinforce.py
- https://medium.com/@ts1829/policy-gradient-reinforcement-learning-in-pytorch-df1383ea0baf [Notebook](https://nbviewer.jupyter.org/urls/gist.githubusercontent.com/ts1829/ebbe2cf946bf36951b724818c52e36b9/raw/4da449bffe9835e201f2fb34f381fbb53568d1ca/Policy%20Gradient%20with%20Cartpole%20and%20PyTorch%20%28Medium%20Version%29.ipynb)

In [136]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import count
import itertools
from collections import namedtuple, defaultdict
from tqdm import tqdm, trange
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline

import random
from tuplestate import *
from gamestate import *
from benchmarking import *
from vectorize import *
random.seed(0)
# print(to_pretty_string(klonstate))

In [101]:
all_solutions = os.listdir('./bench/shootme/')

def solve_state(ret):
    lines = ret.splitlines()
    result = lines[15]
    if result.startswith('Minimal solution'):
        return "Solved-Min"
    elif result.startswith("Solved"):
        return "Solved"
    elif result.startswith('Impossible'):
        return "Impossible"
    elif result.startswith('Unknown'):
        return "Unknown"

def clf_seeds(seedlist):
    results = defaultdict(set)
    for seed in seedlist:
        with open(f"./bench/shootme/{seed}") as f:
            ret = f.read()
            result = solve_state(ret)
            results[result].add(seed)
    return results

def clf_summary(seedlist):
    results = clf_seeds(seedlist)
    states = ['Solved-Min', 'Solved', 'Impossible', 'Unknown']
    for clfstate in states:
        seeds = results[clfstate]
        print(f"{clfstate:12} {len(seeds):8,}")
        total = sum(len(s) for s in results.values())
    print(('-'*12) + '-' + ('-'*8))
    print(f"{'Total':12} {total:8,}") 
          
def get_state(ret):
    deck_json = convert_shootme_to_solvitaire_json(ret)
    return init_from_solvitaire(deck_json)

def map_seeds_to_states(seed_seq):
    for seed in seed_seq:
        with open(f"bench/shootme/{seed}") as f:
            ret = f.read()
            state = get_state(ret)
            yield seed, state

print("All seeds")
clf_summary(all_solutions)

All seeds
Solved-Min      7,876
Solved            320
Impossible      1,282
Unknown           522
---------------------
Total          10,000


- In features: game state vector of size $233 \times 104$

- Out features: legal moves vector of size $623$

In [172]:
IN = 233*104
OUT = 623

Args = namedtuple('Args', 'gamma lr seed render log_interval')
args = Args(
    gamma=0.5,
    lr=5e-2,
    seed=1,
    render=False,
    log_interval=20)


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(IN, 800)
        self.dropout = nn.Dropout(p=0.6)
        self.affine2 = nn.Linear(800, OUT)
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = self.affine1(x)
        x = F.relu(x)
        x = self.dropout(x)
        action_scores = self.affine2(x)
        return F.softmax(action_scores, dim=1)

def select_action(klonstate):
    state_vec = state_to_vec(klonstate)
    movefilter = vector_legal_moves(klonstate)
    torch_state_vec = torch.from_numpy(state_vec).float().reshape(-1).unsqueeze(0)
    torch_filter = torch.from_numpy(movefilter.astype(np.float32)).unsqueeze(0)
    probs = policy(torch_state_vec) * torch_filter
    if (probs == 0).all():
        torch_filter.requires_grad_()
        # sample all legal moves with uniform probability
        m = Categorical(torch_filter)
    else:
        # :attr:`probs` will be normalized to sum to 1
        m = Categorical(probs)
    action = m.sample()
    log_prob = m.log_prob(action)
    policy.saved_log_probs.append(log_prob)
    return action.item()

def finish_episode():
    R = 0
    policy_loss = []
    returns = []
    for r in policy.rewards[::-1]:
        R = r + args.gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    for log_prob, R in zip(policy.saved_log_probs, returns):
        policy_loss.append(-log_prob * R)
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]
    
def step(curr_state, move_code):
    new_state = play_move(curr_state, move_code)
    reward = 0  
    if state_is_win(new_state) or all_cards_faceup(new_state):
        reward = 1
    if not state_is_legal(new_state):
        print('\n got illegal state by playing move', move_code)
        print('prev state')
        print(to_pretty_string(curr_state))
        print('\nnew state')
        print(to_pretty_string(new_state))
        assert state_is_legal(new_state), 'got illegal state'
    return new_state, reward

def score_state(klonstate):
    fnds = [klonstate.foundation1, klonstate.foundation2, 
            klonstate.foundation3, klonstate.foundation4]
    return sum(map(len, fnds))

In [174]:
# policy = Policy()
# optimizer = optim.SGD(policy.parameters(), lr=1e-2)
# eps = np.finfo(np.float32).eps.item()

# random.seed(0)
# train_seeds = random.sample(all_solutions, k=100)
# print('Training game set')
# clf_summary(train_seeds)

training_games = map_seeds_to_states(all_solutions)

for i, env in enumerate(training_games):
    seed, klonstate = env
#     print(seed)
    ep_reward = 0
    done = False
    path = []
    visited = set()
    for t in range(1, 1000):
        action_idx = select_action(klonstate)
        move_code = all_moves[action_idx]
        klonstate, reward = step(klonstate, move_code)
        path.append(move_code)
        if klonstate in visited:
            reward = -1
        else:
            visited.add(klonstate)
        done = reward != 0
#         if reward > 0:
#             # if it didnt hit a cycle then reward = state score
#             reward = score_state(klonstate)
#         elif reward < 0: # hit cycle
#             # few steps hitting a cycle gets large negative reward
#             # many steps hitting a cycle gets a smaller negative reward
#             reward = 10/(t * reward)
        policy.rewards.append(reward)
        ep_reward += reward
        if done:
#             print(f'{seed:11} done after {t:4} steps {ep_reward:.2f}')
            break
    print(f"{i:5} ep reward {ep_reward:.2f}   steps {t:3}")#, end=' ')
#     print('final score', score_state(klonstate))
#     print('path', path)
#     print('final state')
#     print(to_pretty_string(klonstate))
    
    finish_episode()

ep reward -1.00   steps  18   
ep reward -1.00   steps   4   
ep reward -1.00   steps  14   
ep reward -1.00   steps   8   
ep reward -1.00   steps   3   
ep reward -1.00   steps   5   
ep reward -1.00   steps   4   
ep reward -1.00   steps   4   
ep reward -1.00   steps   3   
ep reward -1.00   steps  15   
ep reward -1.00   steps   6   
ep reward -1.00   steps   3   
ep reward -1.00   steps  20   
ep reward -1.00   steps   5   
ep reward -1.00   steps   4   
ep reward -1.00   steps   6   
ep reward -1.00   steps   6   
ep reward -1.00   steps   7   
ep reward -1.00   steps   3   
ep reward -1.00   steps   5   
ep reward -1.00   steps   5   
ep reward -1.00   steps   4   
ep reward -1.00   steps   5   
ep reward -1.00   steps   5   
ep reward -1.00   steps  14   
ep reward -1.00   steps   4   
ep reward -1.00   steps  11   
ep reward -1.00   steps   7   
ep reward -1.00   steps   8   
ep reward -1.00   steps  10   
ep reward -1.00   steps  14   
ep reward -1.00   steps  10   
ep rewar

ep reward -1.00   steps  13   
ep reward -1.00   steps   7   
ep reward -1.00   steps   7   
ep reward -1.00   steps   5   
ep reward -1.00   steps   4   
ep reward -1.00   steps   6   
ep reward -1.00   steps   8   
ep reward -1.00   steps   9   
ep reward -1.00   steps   7   
ep reward -1.00   steps   5   
ep reward -1.00   steps   7   
ep reward -1.00   steps  14   
ep reward -1.00   steps   7   
ep reward -1.00   steps   9   
ep reward -1.00   steps   5   
ep reward -1.00   steps  20   
ep reward -1.00   steps   8   
ep reward -1.00   steps   6   
ep reward -1.00   steps  16   
ep reward -1.00   steps   4   
ep reward -1.00   steps  17   
ep reward -1.00   steps   3   
ep reward -1.00   steps  10   
ep reward -1.00   steps   5   
ep reward -1.00   steps   8   
ep reward -1.00   steps   4   
ep reward -1.00   steps   6   
ep reward -1.00   steps  10   
ep reward -1.00   steps  12   
ep reward -1.00   steps   9   
ep reward -1.00   steps   7   
ep reward -1.00   steps   7   
ep rewar

ep reward -1.00   steps   9   
ep reward -1.00   steps  53   
ep reward -1.00   steps  14   
ep reward -1.00   steps   3   
ep reward -1.00   steps   3   
ep reward -1.00   steps  43   
ep reward -1.00   steps  15   
ep reward -1.00   steps  18   
ep reward -1.00   steps  10   
ep reward -1.00   steps  29   
ep reward -1.00   steps   4   
ep reward -1.00   steps  10   
ep reward -1.00   steps   4   
ep reward -1.00   steps   9   
ep reward -1.00   steps  17   
ep reward -1.00   steps  17   
ep reward -1.00   steps  13   
ep reward -1.00   steps  15   
ep reward -1.00   steps  27   
ep reward -1.00   steps   7   
ep reward -1.00   steps  19   
ep reward -1.00   steps   4   
ep reward -1.00   steps  21   
ep reward -1.00   steps   6   
ep reward -1.00   steps  19   
ep reward -1.00   steps  19   
ep reward -1.00   steps   8   
ep reward -1.00   steps  21   
ep reward -1.00   steps   9   
ep reward -1.00   steps   4   
ep reward -1.00   steps  12   
ep reward -1.00   steps  12   
ep rewar

ep reward -1.00   steps  25   
ep reward -1.00   steps  52   
ep reward -1.00   steps  37   
ep reward -1.00   steps  37   
ep reward -1.00   steps  54   
ep reward -1.00   steps  17   
ep reward -1.00   steps  22   
ep reward -1.00   steps  14   
ep reward -1.00   steps  49   
ep reward -1.00   steps  79   
ep reward -1.00   steps  38   
ep reward -1.00   steps  48   
ep reward -1.00   steps   9   
ep reward -1.00   steps  26   
ep reward -1.00   steps  30   
ep reward -1.00   steps  15   
ep reward -1.00   steps   9   
ep reward -1.00   steps   9   
ep reward -1.00   steps  14   
ep reward -1.00   steps  14   
ep reward -1.00   steps  36   
ep reward -1.00   steps  26   
ep reward -1.00   steps  29   
ep reward -1.00   steps  15   
ep reward -1.00   steps  12   
ep reward -1.00   steps  12   
ep reward -1.00   steps  27   
ep reward -1.00   steps  32   
ep reward -1.00   steps  17   
ep reward -1.00   steps  14   
ep reward -1.00   steps  25   
ep reward -1.00   steps  22   
ep rewar

ep reward -1.00   steps  37   
ep reward -1.00   steps  29   
ep reward -1.00   steps  21   
ep reward -1.00   steps  15   
ep reward -1.00   steps  51   
ep reward -1.00   steps  18   
ep reward -1.00   steps  24   
ep reward -1.00   steps  17   
ep reward -1.00   steps  20   
ep reward -1.00   steps  60   
ep reward -1.00   steps   7   
ep reward -1.00   steps  44   
ep reward -1.00   steps  47   
ep reward -1.00   steps  11   
ep reward -1.00   steps  36   
ep reward -1.00   steps  13   
ep reward -1.00   steps  33   
ep reward -1.00   steps  30   
ep reward -1.00   steps  45   
ep reward -1.00   steps  14   
ep reward -1.00   steps  26   
ep reward -1.00   steps  28   
ep reward -1.00   steps  51   
ep reward -1.00   steps  33   
ep reward -1.00   steps   9   
ep reward -1.00   steps  37   
ep reward -1.00   steps  28   
ep reward -1.00   steps  25   
ep reward -1.00   steps  31   
ep reward -1.00   steps  33   
ep reward -1.00   steps  39   
ep reward -1.00   steps  22   
ep rewar

ep reward -1.00   steps  32   
ep reward -1.00   steps  30   
ep reward -1.00   steps  30   
ep reward -1.00   steps  44   
ep reward -1.00   steps  22   
ep reward -1.00   steps  55   
ep reward -1.00   steps  30   
ep reward -1.00   steps  13   
ep reward -1.00   steps  19   
ep reward -1.00   steps  33   
ep reward -1.00   steps  42   
ep reward -1.00   steps  26   
ep reward -1.00   steps  19   
ep reward -1.00   steps   8   
ep reward -1.00   steps  44   
ep reward -1.00   steps  24   
ep reward -1.00   steps  45   
ep reward -1.00   steps  22   
ep reward -1.00   steps  65   
ep reward -1.00   steps  42   
ep reward -1.00   steps  35   
ep reward -1.00   steps  16   
ep reward -1.00   steps  30   
ep reward -1.00   steps  49   
ep reward -1.00   steps  26   
ep reward -1.00   steps  22   
ep reward -1.00   steps   9   
ep reward -1.00   steps  30   
ep reward -1.00   steps  44   
ep reward -1.00   steps  28   
ep reward -1.00   steps   9   
ep reward -1.00   steps  23   
ep rewar

ep reward -1.00   steps  34   
ep reward -1.00   steps  31   
ep reward -1.00   steps  12   
ep reward -1.00   steps  38   
ep reward -1.00   steps  34   
ep reward -1.00   steps  56   
ep reward -1.00   steps  44   
ep reward -1.00   steps  26   
ep reward -1.00   steps  38   
ep reward -1.00   steps  42   
ep reward -1.00   steps  20   
ep reward -1.00   steps   9   
ep reward -1.00   steps  41   
ep reward -1.00   steps  36   
ep reward -1.00   steps  26   
ep reward -1.00   steps  40   
ep reward -1.00   steps  26   
ep reward -1.00   steps  51   
ep reward -1.00   steps  42   
ep reward -1.00   steps  19   
ep reward -1.00   steps  17   
ep reward -1.00   steps  29   
ep reward -1.00   steps  44   
ep reward -1.00   steps  26   
ep reward -1.00   steps  45   
ep reward -1.00   steps  28   
ep reward -1.00   steps  23   
ep reward -1.00   steps  56   
ep reward -1.00   steps  14   
ep reward -1.00   steps  27   
ep reward -1.00   steps  24   
ep reward -1.00   steps  41   
ep rewar

ep reward -1.00   steps  29   
ep reward -1.00   steps  29   
ep reward -1.00   steps  17   
ep reward -1.00   steps  23   
ep reward -1.00   steps  23   
ep reward -1.00   steps   9   
ep reward -1.00   steps  41   
ep reward -1.00   steps  37   
ep reward -1.00   steps  17   
ep reward -1.00   steps  21   
ep reward -1.00   steps  49   
ep reward -1.00   steps  28   
ep reward -1.00   steps  23   
ep reward -1.00   steps  32   
ep reward -1.00   steps  22   
ep reward -1.00   steps  10   
ep reward -1.00   steps  46   
ep reward -1.00   steps  12   
ep reward -1.00   steps  21   
ep reward -1.00   steps  32   
ep reward -1.00   steps  38   
ep reward -1.00   steps  30   
ep reward -1.00   steps  50   
ep reward -1.00   steps  23   
ep reward -1.00   steps  42   
ep reward -1.00   steps  29   
ep reward -1.00   steps  48   
ep reward -1.00   steps  33   
ep reward -1.00   steps  33   
ep reward -1.00   steps  21   
ep reward -1.00   steps  20   
ep reward -1.00   steps  27   
ep rewar

ep reward -1.00   steps  23   
ep reward -1.00   steps  29   
ep reward -1.00   steps  29   
ep reward -1.00   steps  38   
ep reward -1.00   steps  18   
ep reward -1.00   steps  24   
ep reward -1.00   steps  31   
ep reward -1.00   steps  37   
ep reward -1.00   steps  29   
ep reward -1.00   steps  30   
ep reward -1.00   steps  30   
ep reward -1.00   steps  32   
ep reward -1.00   steps  35   
ep reward -1.00   steps  19   
ep reward -1.00   steps  44   
ep reward -1.00   steps  32   
ep reward -1.00   steps  20   
ep reward -1.00   steps  48   
ep reward -1.00   steps  22   
ep reward -1.00   steps  26   
ep reward -1.00   steps  26   
ep reward -1.00   steps  23   
ep reward -1.00   steps  43   
ep reward -1.00   steps  30   
ep reward -1.00   steps  50   
ep reward -1.00   steps  24   
ep reward -1.00   steps  43   
ep reward -1.00   steps  28   
ep reward -1.00   steps  23   
ep reward -1.00   steps  26   
ep reward -1.00   steps  12   
ep reward -1.00   steps  34   
ep rewar

ep reward -1.00   steps  40   
ep reward -1.00   steps  42   
ep reward -1.00   steps  39   
ep reward -1.00   steps  30   
ep reward -1.00   steps  24   
ep reward -1.00   steps  17   
ep reward -1.00   steps  24   
ep reward -1.00   steps  39   
ep reward -1.00   steps  38   
ep reward -1.00   steps  38   
ep reward -1.00   steps  43   
ep reward -1.00   steps  14   
ep reward -1.00   steps  17   
ep reward -1.00   steps  45   
ep reward -1.00   steps  16   
ep reward -1.00   steps  24   
ep reward -1.00   steps  21   
ep reward -1.00   steps  35   
ep reward -1.00   steps  20   
ep reward -1.00   steps  25   
ep reward -1.00   steps  26   
ep reward -1.00   steps  23   
ep reward -1.00   steps  43   
ep reward -1.00   steps  36   
ep reward -1.00   steps  21   
ep reward -1.00   steps  22   
ep reward -1.00   steps  46   
ep reward -1.00   steps  16   
ep reward -1.00   steps  44   
ep reward -1.00   steps  46   
ep reward -1.00   steps  14   
ep reward -1.00   steps  48   
ep rewar

ep reward -1.00   steps  20   
ep reward -1.00   steps  59   
ep reward -1.00   steps  27   
ep reward -1.00   steps  19   
ep reward -1.00   steps  34   
ep reward -1.00   steps  46   
ep reward -1.00   steps  27   
ep reward -1.00   steps  34   
ep reward -1.00   steps  35   
ep reward -1.00   steps  32   
ep reward -1.00   steps  36   
ep reward -1.00   steps  53   
ep reward -1.00   steps  32   
ep reward -1.00   steps  42   
ep reward -1.00   steps  32   
ep reward -1.00   steps  28   
ep reward -1.00   steps  44   
ep reward -1.00   steps  23   
ep reward -1.00   steps  34   
ep reward -1.00   steps  14   
ep reward -1.00   steps  17   
ep reward -1.00   steps  30   
ep reward -1.00   steps  35   
ep reward -1.00   steps  28   
ep reward -1.00   steps  31   
ep reward -1.00   steps  37   
ep reward -1.00   steps  27   
ep reward -1.00   steps  45   
ep reward -1.00   steps  29   
ep reward -1.00   steps  21   
ep reward -1.00   steps  52   
ep reward -1.00   steps  45   
ep rewar

ep reward -1.00   steps  30   
ep reward -1.00   steps  42   
ep reward -1.00   steps  25   
ep reward -1.00   steps   9   
ep reward -1.00   steps  24   
ep reward -1.00   steps  15   
ep reward -1.00   steps  33   
ep reward -1.00   steps   9   
ep reward -1.00   steps  34   
ep reward -1.00   steps  26   
ep reward -1.00   steps  18   
ep reward -1.00   steps  29   
ep reward -1.00   steps  42   
ep reward -1.00   steps  22   
ep reward -1.00   steps  24   
ep reward -1.00   steps  14   
ep reward -1.00   steps  15   
ep reward -1.00   steps  11   
ep reward -1.00   steps  37   
ep reward -1.00   steps  42   
ep reward -1.00   steps   8   
ep reward -1.00   steps  39   
ep reward -1.00   steps  43   
ep reward -1.00   steps  24   
ep reward -1.00   steps  20   
ep reward -1.00   steps  23   
ep reward -1.00   steps  45   
ep reward -1.00   steps   4   
ep reward -1.00   steps  17   
ep reward -1.00   steps  10   
ep reward -1.00   steps   9   
ep reward -1.00   steps  30   
ep rewar

ep reward -1.00   steps  33   
ep reward -1.00   steps  40   
ep reward -1.00   steps  27   
ep reward -1.00   steps  38   
ep reward -1.00   steps  34   
ep reward -1.00   steps  28   
ep reward -1.00   steps  18   
ep reward -1.00   steps  23   
ep reward -1.00   steps  18   
ep reward -1.00   steps  56   
ep reward -1.00   steps   9   
ep reward -1.00   steps  51   
ep reward -1.00   steps  35   
ep reward -1.00   steps  35   
ep reward -1.00   steps  33   
ep reward -1.00   steps  24   
ep reward -1.00   steps  19   
ep reward -1.00   steps  23   
ep reward -1.00   steps  13   
ep reward -1.00   steps  32   
ep reward -1.00   steps  11   
ep reward -1.00   steps  36   
ep reward -1.00   steps  27   
ep reward -1.00   steps  45   
ep reward -1.00   steps  25   
ep reward -1.00   steps  29   
ep reward -1.00   steps  52   
ep reward -1.00   steps  23   
ep reward -1.00   steps  44   
ep reward -1.00   steps  24   
ep reward -1.00   steps  30   
ep reward -1.00   steps  42   
ep rewar

ep reward -1.00   steps  16   
ep reward -1.00   steps  22   
ep reward -1.00   steps  43   
ep reward -1.00   steps  30   
ep reward -1.00   steps  26   
ep reward -1.00   steps  28   
ep reward -1.00   steps  24   
ep reward -1.00   steps  36   
ep reward -1.00   steps  41   
ep reward -1.00   steps  28   
ep reward -1.00   steps  40   
ep reward -1.00   steps  18   
ep reward -1.00   steps  43   
ep reward -1.00   steps  28   
ep reward -1.00   steps   9   
ep reward -1.00   steps  25   
ep reward -1.00   steps  44   
ep reward -1.00   steps  36   
ep reward -1.00   steps  37   
ep reward -1.00   steps  38   
ep reward -1.00   steps  36   
ep reward -1.00   steps  35   
ep reward -1.00   steps  29   
ep reward -1.00   steps  27   
ep reward -1.00   steps  11   
ep reward -1.00   steps  18   
ep reward -1.00   steps  36   
ep reward -1.00   steps  45   
ep reward -1.00   steps  35   
ep reward -1.00   steps  28   
ep reward -1.00   steps  28   
ep reward -1.00   steps  58   
ep rewar

ep reward -1.00   steps  31   
ep reward -1.00   steps  22   
ep reward -1.00   steps  43   
ep reward -1.00   steps  22   
ep reward -1.00   steps  52   
ep reward -1.00   steps  44   
ep reward -1.00   steps  37   
ep reward -1.00   steps  53   
ep reward -1.00   steps  36   
ep reward -1.00   steps  38   
ep reward -1.00   steps  42   
ep reward -1.00   steps  31   
ep reward -1.00   steps  32   
ep reward -1.00   steps  10   
ep reward -1.00   steps  56   
ep reward -1.00   steps  38   
ep reward -1.00   steps  44   
ep reward -1.00   steps  42   
ep reward -1.00   steps  39   
ep reward -1.00   steps  26   
ep reward -1.00   steps  26   
ep reward -1.00   steps  20   
ep reward -1.00   steps  41   
ep reward -1.00   steps  23   
ep reward -1.00   steps  31   
ep reward -1.00   steps  41   
ep reward -1.00   steps  46   
ep reward -1.00   steps  11   
ep reward -1.00   steps  37   
ep reward -1.00   steps  27   
ep reward -1.00   steps  34   
ep reward -1.00   steps  28   
ep rewar

ep reward -1.00   steps  20   
ep reward -1.00   steps  33   
ep reward -1.00   steps  20   
ep reward -1.00   steps  34   
ep reward -1.00   steps  47   
ep reward -1.00   steps  37   
ep reward -1.00   steps  28   
ep reward -1.00   steps  22   
ep reward -1.00   steps  24   
ep reward -1.00   steps  23   
ep reward -1.00   steps  35   
ep reward -1.00   steps  41   
ep reward -1.00   steps  34   
ep reward -1.00   steps  53   
ep reward -1.00   steps  43   
ep reward -1.00   steps  51   
ep reward -1.00   steps  47   
ep reward -1.00   steps  38   
ep reward -1.00   steps  33   
ep reward -1.00   steps  22   
ep reward -1.00   steps  41   
ep reward -1.00   steps  37   
ep reward -1.00   steps  38   
ep reward -1.00   steps  38   
ep reward -1.00   steps  30   
ep reward -1.00   steps  51   
ep reward -1.00   steps  29   
ep reward -1.00   steps  38   
ep reward -1.00   steps  44   
ep reward -1.00   steps  44   
ep reward -1.00   steps  22   
ep reward -1.00   steps  26   
ep rewar

ep reward -1.00   steps  18   
ep reward -1.00   steps  34   
ep reward -1.00   steps  31   
ep reward -1.00   steps  22   
ep reward -1.00   steps  55   
ep reward -1.00   steps  31   
ep reward -1.00   steps  31   
ep reward -1.00   steps  12   
ep reward -1.00   steps  70   
ep reward -1.00   steps  29   
ep reward -1.00   steps  37   
ep reward -1.00   steps  44   
ep reward -1.00   steps  23   
ep reward -1.00   steps  26   
ep reward -1.00   steps  36   
ep reward -1.00   steps  32   
ep reward -1.00   steps  39   
ep reward -1.00   steps  20   
ep reward -1.00   steps  39   
ep reward -1.00   steps  22   
ep reward -1.00   steps  20   
ep reward -1.00   steps  27   
ep reward -1.00   steps  44   
ep reward -1.00   steps  24   
ep reward -1.00   steps  47   
ep reward -1.00   steps  36   
ep reward -1.00   steps  13   
ep reward -1.00   steps  23   
ep reward -1.00   steps  28   
ep reward -1.00   steps  54   
ep reward -1.00   steps  50   
ep reward -1.00   steps  42   
ep rewar

ep reward -1.00   steps  27   
ep reward -1.00   steps  45   
ep reward -1.00   steps  36   
ep reward -1.00   steps  23   
ep reward -1.00   steps  30   
ep reward -1.00   steps  31   
ep reward -1.00   steps  43   
ep reward -1.00   steps  42   
ep reward -1.00   steps  31   
ep reward -1.00   steps  22   
ep reward -1.00   steps  27   
ep reward -1.00   steps  49   
ep reward -1.00   steps  38   
ep reward -1.00   steps  35   
ep reward -1.00   steps  17   
ep reward -1.00   steps  18   
ep reward -1.00   steps  27   
ep reward -1.00   steps  16   
ep reward -1.00   steps  34   
ep reward -1.00   steps  23   
ep reward -1.00   steps  32   
ep reward -1.00   steps  29   
ep reward -1.00   steps  46   
ep reward -1.00   steps  30   
ep reward -1.00   steps  33   
ep reward -1.00   steps  29   
ep reward -1.00   steps  53   
ep reward -1.00   steps  50   
ep reward -1.00   steps  41   
ep reward -1.00   steps  19   
ep reward -1.00   steps  29   
ep reward -1.00   steps  54   
ep rewar

ep reward -1.00   steps  43   
ep reward -1.00   steps  29   
ep reward -1.00   steps  32   
ep reward -1.00   steps  37   
ep reward -1.00   steps  86   
ep reward -1.00   steps  27   
ep reward -1.00   steps  50   
ep reward -1.00   steps  37   
ep reward -1.00   steps  51   
ep reward -1.00   steps  43   
ep reward -1.00   steps  18   
ep reward -1.00   steps  15   
ep reward -1.00   steps  19   
ep reward -1.00   steps  38   
ep reward -1.00   steps  13   
ep reward -1.00   steps  21   
ep reward -1.00   steps  32   
ep reward -1.00   steps  22   
ep reward -1.00   steps  33   
ep reward -1.00   steps  13   
ep reward -1.00   steps  30   
ep reward -1.00   steps  29   
ep reward -1.00   steps  28   
ep reward -1.00   steps  40   
ep reward -1.00   steps  17   
ep reward -1.00   steps  38   
ep reward -1.00   steps  17   
ep reward -1.00   steps  24   
ep reward -1.00   steps  39   
ep reward -1.00   steps  18   
ep reward -1.00   steps  41   
ep reward -1.00   steps  39   
ep rewar

ep reward -1.00   steps  42   
ep reward -1.00   steps  50   
ep reward -1.00   steps  26   
ep reward -1.00   steps  16   
ep reward -1.00   steps  46   
ep reward -1.00   steps  39   
ep reward -1.00   steps  18   
ep reward -1.00   steps  31   
ep reward -1.00   steps  57   
ep reward -1.00   steps  16   
ep reward -1.00   steps  46   
ep reward -1.00   steps  31   
ep reward -1.00   steps  60   
ep reward -1.00   steps  13   
ep reward -1.00   steps  42   
ep reward -1.00   steps  33   
ep reward -1.00   steps  33   
ep reward -1.00   steps  32   
ep reward -1.00   steps  37   
ep reward -1.00   steps  22   
ep reward -1.00   steps  17   
ep reward -1.00   steps  31   
ep reward -1.00   steps  21   
ep reward -1.00   steps  21   
ep reward -1.00   steps  20   
ep reward -1.00   steps  19   
ep reward -1.00   steps  47   
ep reward -1.00   steps  61   
ep reward -1.00   steps  36   
ep reward -1.00   steps  56   
ep reward -1.00   steps  33   
ep reward -1.00   steps  33   
ep rewar

ep reward -1.00   steps  17   
ep reward -1.00   steps  11   
ep reward -1.00   steps  24   
ep reward -1.00   steps   6   
ep reward -1.00   steps  10   
ep reward -1.00   steps  69   
ep reward -1.00   steps  41   
ep reward -1.00   steps  36   
ep reward -1.00   steps  30   
ep reward -1.00   steps  16   
ep reward -1.00   steps  37   
ep reward -1.00   steps  35   
ep reward -1.00   steps  57   
ep reward -1.00   steps  61   
ep reward -1.00   steps  27   
ep reward -1.00   steps  70   
ep reward -1.00   steps  47   
ep reward -1.00   steps  23   
ep reward -1.00   steps  22   
ep reward -1.00   steps  32   
ep reward -1.00   steps  61   
ep reward -1.00   steps  26   
ep reward -1.00   steps  52   
ep reward -1.00   steps  46   
ep reward -1.00   steps  27   
ep reward -1.00   steps  20   
ep reward -1.00   steps  33   
ep reward -1.00   steps  31   
ep reward -1.00   steps  55   
ep reward -1.00   steps  23   
ep reward -1.00   steps  31   
ep reward -1.00   steps  43   
ep rewar

RuntimeError: invalid multinomial distribution (encountering probability entry < 0)

# Save model to checkpoint!

In [175]:
# klonstate = random_solved_endgame(20)
# print(to_pretty_string(klonstate))
# print()

# done = False
# visited = set()
# for i in range(500):
#     visited.add(klonstate)
#     action = select_action(klonstate)
# #     print(f'iteration {i}  action {all_moves[action]}')
#     klonstate, reward = step(klonstate, action)
#     done = reward > 0
#     if done:
#         break

# print()
# if done:
#     print("Solved!")
# else:
#     print(f'Not solved after {i+1} iterations')
    
# print(to_pretty_string(klonstate))

Stock: 
Waste: KD QS
Fnd C: AC 2C 3C 4C 5C 6C 7C 8C
Fnd D: AD 2D 3D 4D 5D 6D 7D 8D
Fnd S: AS 2S 3S 4S 5S 6S 7S 8S
Fnd H: AH 2H 3H 4H 5H 6H 7H 8H
Tab 1: 
Tab 2: KH QC JD TC 9H
Tab 3: KS QD JS TH 9S
Tab 4: JH TS 9D
Tab 5: 
Tab 6: KC QH JC TD 9C
Tab 7: 



RuntimeError: invalid multinomial distribution (encountering probability entry < 0)

In [201]:
policy(torch_state_vec)

tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan

In [178]:
state_vec = state_to_vec(klonstate)
movefilter = vector_legal_moves(klonstate)
torch_state_vec = torch.from_numpy(state_vec).float().reshape(-1).unsqueeze(0)
torch_filter = torch.from_numpy(movefilter.astype(np.float32)).unsqueeze(0)
probs = policy(torch_state_vec) * torch_filter
if (probs == 0).all():
    torch_filter.requires_grad_()
    # sample all legal moves with uniform probability
    m = Categorical(torch_filter)
else:
    # :attr:`probs` will be normalized to sum to 1
    m = Categorical(probs)
action = m.sample()
log_prob = m.log_prob(action)

RuntimeError: invalid multinomial distribution (encountering probability entry < 0)