Based on

- https://github.com/pytorch/examples/blob/87d9a1e930b5b813/reinforcement_learning/reinforce.py
- https://medium.com/@ts1829/policy-gradient-reinforcement-learning-in-pytorch-df1383ea0baf [Notebook](https://nbviewer.jupyter.org/urls/gist.githubusercontent.com/ts1829/ebbe2cf946bf36951b724818c52e36b9/raw/4da449bffe9835e201f2fb34f381fbb53568d1ca/Policy%20Gradient%20with%20Cartpole%20and%20PyTorch%20%28Medium%20Version%29.ipynb)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import count
from collections import namedtuple
from tqdm import tqdm, trange
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline

from tuplestate import *
from get_legal_moves import *
from benchmarking import convert_shootme_to_solvitaire_json


def listdir(path):
    ls = os.listdir(path)
    return [os.path.join(path, f) for f in ls]

solved = listdir('./fixtures/shootme/solved/')
solvedmin = listdir('./fixtures/shootme/solvedmin/')
fixtures = solved + solvedmin

def filename_to_klonstate(fname):
    with open(fname) as f:
        solv = convert_shootme_to_solvitaire_json(f.read())
        state = init_from_solvitaire(solv)
    return state

klonstate = filename_to_klonstate(np.random.choice(fixtures))

In [2]:
Args = namedtuple('Args', 'gamma lr seed render log_interval')
args = Args(
    gamma=0.01,
    lr=5e-2,
    seed=1,
    render=True,
    log_interval=20
)

In [3]:
klonstate

KlonState(stock=('TS', 'AC', '8D', '7D', '6H', 'KH', '8C', '7S', 'KD', 'AS', '7H', '4C', '6C', '5H', '5C', 'KS', '9H', 'JH', '5D', '7C', 'JD', 'KC', '3S', '9S'), tableau1=('4H',), tableau2=('th', 'AH'), tableau3=('4s', '8s', '5S'), tableau4=('qd', '4d', 'qh', '9D'), tableau5=('ad', 'jc', 'td', 'tc', '6D'), tableau6=('qc', '8h', 'js', '9c', '6s', 'QS'), tableau7=('2h', '3d', '2c', '2s', '2d', '3h', '3C'), waste=(), foundation1=(), foundation2=(), foundation3=(), foundation4=())

In [4]:
# sum([len(klonstate[f]) for f in FNDS])

In [5]:
class KlonEnvironment:
    
    def __init__(self):
        self.seed(2019)
        self.state = None
        self.vec_state = None
        self.visited = set()
    
    def seed(self, seed):
        """
        initialize random environment seed
        """
        self.rand = np.random.RandomState(seed)
        
    def __repr__(self):
        return f"<KlonEnvironment (visited={len(self.visited)})>"
    
    @property
    def spec(self):
        Spec = namedtuple('spec', 'reward_threshold')
        return Spec(
            reward_threshold=100
        )
  
    def reset_state(self, state):
        self.state = state
        self.vec_state = state_to_vec(state)
        self.visited = set([ self.state ])
        
    def update_state(self, new_state):
        self.state = new_state
        self.vec_state = state_to_vec(new_state)
        self.visited.add(new_state)
      
    def reset(self):
        """
        sets the environment fresh and random
        returns:
            state
        """
        fixture_file = self.rand.choice(fixtures)
        state = filename_to_klonstate(fixture_file)
        self.reset_state(state)
        return self.vec_state
    
    def step(self, action):
        """
        take the action
        returns:
            state: new state
            reward: reward for taking action
            done: is done?
            info: ...? (None)
        """
        info = None
        done = False
        move_code = np_all_moves[action]
        new_state = play_move(self.state, move_code)
        is_win = state_is_win(new_state)
        if new_state in self.visited:
            done = True
            reward = -100
            info = 'state visited'
        elif is_win:
            done = True
            reward = 100
            info = 'win!'
        else: # non-terminal
            # reward is the total num of cards in a foundation
            reward = sum([len(new_state[f]) for f in FNDS])
        self.update_state(new_state)
        return self.vec_state, reward, done, info
    
    def render(self):
        print(to_dict(self.state))

In [6]:
torch.manual_seed(args.seed)
env = KlonEnvironment()
env.seed(args.seed)

In features:
- game state (vector of size 233)

Out features:
- legal moves (vector of size 623)

We will use a simple feed forward neural network with one hidden layer of 128 neurons and a dropout of 0.6. We'll use Adam as our optimizer and a learning rate of 0.01. Using dropout will significantly improve the performance of our policy.

In [7]:
IN = 233
OUT = 623

class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(IN, 500)
        self.linear1 = nn.Linear(500, 800)
        self.dropout = nn.Dropout(p=0.4)
        self.affine2 = nn.Linear(800, OUT)
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        x = self.affine1(x)
        x = F.relu(x)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.dropout(x)
        action_scores = self.affine2(x)
        return F.softmax(action_scores, dim=1)

In [8]:
# state = env.vec_state
# # do pytorch stuff to get state probabilities
# torchstate = torch.from_numpy(state).float().unsqueeze(0)
# # get probabilities from the last layer of the policy
# probs = policy(torchstate)    
# # get the vector of legal moves for this particular state
# vec = vector_legal_moves(env.state)
# # reshape to be a column vector (1x623) instead of 1-D (623,)
# nv = vec.reshape(1, -1)
# # legal moves are 1, others are 0
# # multiply to make illegal moves 0 prob
# msk_probs = (probs * torch.Tensor(nv))

In [9]:
# # get the vectorized state from the environment
# env.reset()
# state = env.vec_state
# # do pytorch stuff to get state probabilities
# torchstate = torch.from_numpy(state).float().unsqueeze(0)
# # get probabilities from the last layer of the policy
# probs = policy(torchstate)    
# # get the vector of legal moves for this particular state
# vec = vector_legal_moves(env.state)
# # reshape to be a column vector (1x623) instead of 1-D (623,)
# nv = vec.reshape(1, -1)

# if torch.isnan(probs).all(): # this happens! cant sample from Categorical nans
#     # sample random legal move
#     print("all nans choose random")
#     m = Categorical(torch.Tensor(nv))
# else:
#     # legal moves are 1, others are 0
#     # multiply to make illegal moves 0 prob
#     msk_probs = (probs * torch.Tensor(nv))
#     # still 623 probabilities but most are 0
#     # did not seem to require normalizing probabilities to 1
#     print('got a real prob')
#     m = Categorical(msk_probs)

# action = m.sample()
# 1/0

In [10]:
# msk_probs = (probs * torch.Tensor(nv))
# # still 623 probabilities but most are 0
# # did not seem to require normalizing probabilities to 1
# m = Categorical(msk_probs)
# action = m.sample()
# m.log_prob(action)

In [11]:
# msk_probs.requires_grad

In [12]:
# tnv = torch.Tensor(nv)
# tnv.requires_grad_()
# m = Categorical(tnv)
# action = m.sample()
# m.log_prob(action)

In [13]:
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=args.lr)
eps = np.finfo(np.float32).eps.item()

def select_action(env):
    # get the vectorized state from the environment
    state = env.vec_state
    # do pytorch stuff to get state probabilities
    torchstate = torch.from_numpy(state).float().unsqueeze(0)
    # get probabilities from the last layer of the policy
    probs = policy(torchstate)    
    # get the vector of legal moves for this particular state
    vec = vector_legal_moves(env.state)
    # reshape to be a column vector (1x623) instead of 1-D (623,)
    tnv = torch.Tensor(vec.reshape(1, -1))
    
    if torch.isnan(probs).all(): # this happens! cant sample from Categorical nans
        # sample random legal move
#         print("all nans choose random")
        tnv.requires_grad_() # so m.log_prob(action) is diffable
        m = Categorical(tnv)
        action = m.sample()
    else:
        # legal moves are 1, others are 0
        # multiply to make illegal moves 0 prob
        msk_probs = (probs * tnv)
        # still 623 probabilities but most are 0
        # did not seem to require normalizing probabilities to 1
#         print('got a real prob')
        m = Categorical(msk_probs)
        action = m.sample()
    
    # DEBUG
#     print('selecting', np_all_moves[action.tolist()[0]])
    # end debug
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()

def finish_episode():
    R = 0
    policy_loss = []
    returns = []
    for r in policy.rewards[::-1]:
        R = r + args.gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    for log_prob, R in zip(policy.saved_log_probs, returns):
        policy_loss.append(-log_prob * R)
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]

def main():
    running_reward = 0
    for i_episode in count(1):
        state, ep_reward = env.reset(), 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(env)
            state, reward, done, info = env.step(action)
#             if info:
#                 print(info)
#             if args.render:
#                 env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
#         for n,p in policy.named_parameters():
#             print(n, p.mean(), p.var())
#         print()
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

main()

RuntimeError: invalid multinomial distribution (encountering probability entry < 0)

In [14]:
args = Args(
    gamma=0.01,
    lr=3e-2,
    seed=1,
    render=True,
    log_interval=20
)

policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=args.lr)
eps = np.finfo(np.float32).eps.item()

In [96]:
state, ep_reward = env.reset(), 0
print(to_dict(vec_to_state(state)))

for t in range(1, 10000):  # Don't infinite loop while learning
    #     action = select_action(env)
    state = env.vec_state
    torchstate = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(torchstate)    
    vec = vector_legal_moves(env.state)
    tnv = torch.Tensor(vec.reshape(1, -1))
    
    if torch.isnan(probs).all():
        tnv.requires_grad_() # so m.log_prob(action) is diffable
        print('all isnan')
        m = Categorical(tnv)
        action = m.sample()
    else:
        msk_probs = (probs * tnv)
        print('as usual')
        m = Categorical(msk_probs)
        action = m.sample()
    print('action', action)

    policy.saved_log_probs.append(m.log_prob(action))
    action = action.item()

    state, reward, done, info = env.step(action)
    policy.rewards.append(reward)
    ep_reward += reward
    if done:
        break

R = 0
policy_loss = []
returns = []
for r in policy.rewards[::-1]:
    R = r + args.gamma * R
    returns.insert(0, R)

print('returns', returns)
returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + eps)
print('returns scaled', returns)
for log_prob, R in zip(policy.saved_log_probs, returns):
    policy_loss.append(-log_prob * R)
print('policy loss', policy_loss)
optimizer.zero_grad()
policy_loss = torch.cat(policy_loss).sum()
print('policy loss sum', policy_loss)
policy_loss.backward()
optimizer.step()
del policy.rewards[:]
del policy.saved_log_probs[:]

for n,p in policy.named_parameters():
    print(n, p.mean(), p.var())
print()

{'stock': ['QC', '5C', 'AS', 'JS', 'KS', '9D', '5D', '3S', '6C', '8H', '6D', '9C', 'TD', 'KH', '7H', 'JH', 'QS', '8D', '3C', 'QD', '4S', '5S', '7S', 'AH'], 'waste': [], 'tableau': [['3H'], ['9h', 'JC'], ['7d', '4c', '4D'], ['2d', 'ad', 'tc', '2C'], ['kc', '6h', '2h', 'ts', 'KD'], ['ac', '5h', '3d', '9s', '7c', '2S'], ['4h', '8s', 'jd', 'qh', 'th', '8c', '6S']], 'foundation': [[], [], [], []]}
as usual
action tensor([616])
as usual
action tensor([616])
as usual
action tensor([616])
returns [-0.01, -1.0, -100.0]
returns scaled tensor([ 0.5859,  0.5687, -1.1547])
policy loss [tensor([6.9850e-08], grad_fn=<MulBackward0>), tensor([6.7796e-08], grad_fn=<MulBackward0>), tensor([-1.3765e-07], grad_fn=<MulBackward0>)]
policy loss sum tensor(0., grad_fn=<SumBackward0>)
affine1.weight tensor(0.0012, grad_fn=<MeanBackward0>) tensor(0.0077, grad_fn=<VarBackward0>)
affine1.bias tensor(0.0036, grad_fn=<MeanBackward0>) tensor(0.0224, grad_fn=<VarBackward0>)
linear1.weight tensor(9.4251e-05, grad_fn=<M