In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import torchvision
import torchvision.transforms as transforms
import numpy as np
from Simulation import Level

# L1 and L2 are simple and complex examples we will use to test policy
# learning
L1 = ["11111",
     "11011",
     "11111",
     "11111",
     "11X11"]

L2 = ["00011",
     "00011",
     "01011",
     "01111",
     "11011",
     "10000",
     "00100",
     "11011",
     "10111",
     "00X11"]

simple_level = Level(L1)
complex_level = Level(L2)

In [2]:
print(simple_level)
print(complex_level)

11111
11011
11111
11111
11X11

00011
00011
01011
01111
11011
10000
00100
11011
10111
00X11



In [3]:
# Hyperparameters
agent_view = 7*7
agent_choices = 8
learning_rate = 0.01
gamma = 0.99
hidden_size = 128
dropout_prob = 0.0

class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.state_space = agent_view # Input vector
        self.action_space = agent_choices # Number of choices
        
        # Neural Net architecture
        self.l1 = nn.Linear(self.state_space, hidden_size, bias=None)
        self.l2 = nn.Linear(hidden_size, hidden_size, bias=None)
        self.l3 = nn.Linear(hidden_size, self.action_space, bias=None)
        
        self.gamma = gamma
        
        # Episode policy and reward history
        self.policy_history = Variable(torch.Tensor())
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []
    
    def forward(self, x):
        model = torch.nn.Sequential(
            self.l1,
            nn.Dropout(p=dropout_prob),
            nn.ReLU(),
            self.l2,
            nn.Dropout(p=dropout_prob),
            nn.ReLU(),
            self.l3,
            nn.Softmax(dim=-1)
        )
        return model(x)
        

In [4]:
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [5]:
# Implement select_action here
def select_action(state):
    state = torch.from_numpy(state).type(torch.FloatTensor)
    choices = policy(Variable(state))
    c = Categorical(choices)
    action = c.sample()
    
    if policy.policy_history.nelement() == 0:
        policy.policy_history = torch.stack([c.log_prob(action)])
    else:
        policy.policy_history = torch.cat([policy.policy_history, torch.stack([c.log_prob(action)])])
    
    '''
    if policy.policy_history.size()[0] > 1:
        policy.policy_history = torch.cat([policy.policy_history, c.log_prob(action)])
    elif policy.policy_history.nelement() != 0:
        policy.policy_history = torch.stack([policy.policy_history, c.log_prob(action)])
    else:
        policy.policy_history = (c.log_prob(action))
    '''
    return action
        

In [6]:
# We apply Monte-Carlo Policy Gradient to improve out policy according
# to the equation
def update_policy():
    R = 0
    rewards = []
    
    # Discount future rewards back to the present using gamma
    for r in policy.reward_episode[::-1]:
        R = r + policy.gamma * R
        rewards.insert(0,R)
        
    # Scale rewards
    rewards = torch.FloatTensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float64).eps)
    
    # Calculate loss
    loss = (torch.sum(torch.mul(policy.policy_history, Variable(rewards)).mul(-1), -1))
    
    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    #Save and intialize episode history counters
    policy.loss_history.append(loss.data[0])
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode= []

In [7]:
def reward_func(x):
    return 1/(x+1)

level = simple_level
max_reward = 1

def main(episodes):
    for episode in range(episodes):
        done = False     
        level.Reset()
        while not done:
            state = np.asarray(level.getVector())
            action = select_action(state)
            print(action.item())
            x,done = level.Act(action.item())
            reward = reward_func(x)
            policy.reward_episode.append(reward)
            if reward == 1:
                print("Reached the end!")
            if done:
                break  
        update_policy()
        print("Episode Done!")

In [8]:
main(1000)

4
5
3
6
4




Episode Done!
2
3
Episode Done!
3
Episode Done!


RuntimeError: invalid argument 2: invalid multinomial distribution (encountering probability entry < 0) at c:\programdata\miniconda3\conda-bld\pytorch-cpu_1524541161962\work\aten\src\th\generic/THTensorRandom.cpp:326