In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import torchvision
import torchvision.transforms as transforms
import numpy as np
from Simulation import Level

# L1 and L2 are simple and complex examples we will use to test policy
# learning
L1 = ["11111",
     "11011",
     "11111",
     "11111",
     "11X11"]

L2 = ["00011",
     "00011",
     "01011",
     "01111",
     "11011",
     "10000",
     "00100",
     "11011",
     "10111",
     "00X11"]

simple_level = Level(L1)
complex_level = Level(L2)

In [2]:
print(simple_level)
print(complex_level)

11111
11011
11111
11111
11X11

00011
00011
01011
01111
11011
10000
00100
11011
10111
00X11



In [3]:
# Hyperparameters
agent_view = 7*7
agent_choices = 8
learning_rate = 0.01
gamma = 0.99
hidden_size = 128
dropout_prob = 0.6

class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.state_space = agent_view # Input vector
        self.action_space = agent_choices # Number of choices
        
        # Neural Net architecture
        self.l1 = nn.Linear(self.state_space, hidden_size, bias=True)
        self.l2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.l3 = nn.Linear(hidden_size, self.action_space, bias=False)
        
        self.gamma = gamma
        
        # Episode policy and reward history
        self.policy_history = Variable(torch.Tensor())
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []
    
    def forward(self, x):
        model = torch.nn.Sequential(
            self.l1,
            nn.Dropout(p=dropout_prob),
            nn.ReLU(),
            self.l2,
            nn.Dropout(p=dropout_prob),
            nn.ReLU(),
            self.l3,
            nn.Softmax(dim=-1)
        )
        return model(x)
        

In [4]:
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [5]:
# Implement select_action here
def select_action(state):
    state = torch.from_numpy(state).type(torch.FloatTensor)
    choices = policy(Variable(state))
    print(choices)
    c = Categorical(choices)
    action = c.sample()
    
    if policy.policy_history.nelement() == 0:
        policy.policy_history = torch.stack([c.log_prob(action)])
    else:
        policy.policy_history = torch.cat([policy.policy_history, torch.stack([c.log_prob(action)])])
    
    '''
    if policy.policy_history.size()[0] > 1:
        policy.policy_history = torch.cat([policy.policy_history, c.log_prob(action)])
    elif policy.policy_history.nelement() != 0:
        policy.policy_history = torch.stack([policy.policy_history, c.log_prob(action)])
    else:
        policy.policy_history = (c.log_prob(action))
    '''
    return action
        

In [6]:
# We apply Monte-Carlo Policy Gradient to improve out policy according
# to the equation
def update_policy():
    R = 0
    rewards = []
    
    # Discount future rewards back to the present using gamma
    for r in policy.reward_episode[::-1]:
        R = r + policy.gamma * R
        rewards.insert(0,R)
        
    # Scale rewards
    rewards = torch.FloatTensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float64).eps)
    
    # Calculate loss
    loss = (torch.sum(torch.mul(policy.policy_history, Variable(rewards)).mul(-1), -1))
    
    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    #Save and intialize episode history counters
    policy.loss_history.append(loss.data[0])
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode= []

In [7]:
def reward_func(x):
    return 1/(x+1)

level = simple_level
max_reward = 1

def main(episodes):
    for episode in range(episodes):
        done = False     
        level.Reset()
        while not done:
            state = np.asarray(level.getVector())
            action = select_action(state)
            print(action.item())
            x,done = level.Act(action.item())
            reward = reward_func(x)
            policy.reward_episode.append(reward)
            if reward == 1:
                print("Reached the end!")
            if done:
                break  
        update_policy()
        print("Episode Done!")

In [8]:
main(1000)

tensor([ 0.1095,  0.1244,  0.1419,  0.1255,  0.1244,  0.0895,  0.1170,
         0.1677])
6
tensor([ 0.1118,  0.1122,  0.1475,  0.1168,  0.1000,  0.1008,  0.1406,
         0.1703])
5
tensor([ 0.1556,  0.0991,  0.1259,  0.1039,  0.1090,  0.1519,  0.1276,
         0.1272])
0
tensor([ 0.0926,  0.1542,  0.1146,  0.1396,  0.1084,  0.1030,  0.1305,
         0.1572])
1
tensor([ 0.0969,  0.1838,  0.1639,  0.1249,  0.0709,  0.1205,  0.0913,
         0.1479])
0
tensor([ 0.1072,  0.1745,  0.1266,  0.0955,  0.1208,  0.1489,  0.1121,
         0.1143])
0
tensor([ 0.0921,  0.1537,  0.1109,  0.1144,  0.0989,  0.1392,  0.1390,
         0.1518])
3
tensor([ 0.1513,  0.1330,  0.1262,  0.1019,  0.0915,  0.1058,  0.1477,
         0.1427])
2
tensor([ 0.1564,  0.1125,  0.1163,  0.0993,  0.0976,  0.1279,  0.0871,
         0.2029])
6
tensor([ 0.0946,  0.1003,  0.1451,  0.1275,  0.0825,  0.1127,  0.0892,
         0.2481])
0
tensor([ 0.1409,  0.1125,  0.1301,  0.1035,  0.0814,  0.1293,  0.1595,
         0.1429])
3



Episode Done!
tensor([ 0.1182,  0.1374,  0.1094,  0.0970,  0.1214,  0.1581,  0.1552,
         0.1032])
2
tensor([ 0.1068,  0.1236,  0.1323,  0.0796,  0.1021,  0.0986,  0.2010,
         0.1560])
2
tensor([ 0.1374,  0.1217,  0.1316,  0.1233,  0.1140,  0.1221,  0.1316,
         0.1182])
6
Episode Done!
tensor([ 0.1013,  0.0836,  0.2067,  0.0429,  0.1296,  0.1030,  0.1356,
         0.1973])
5
tensor([ 0.1751,  0.0983,  0.1438,  0.0798,  0.0813,  0.1289,  0.1373,
         0.1555])
6
tensor([ 0.1616,  0.1550,  0.2114,  0.0489,  0.1505,  0.0856,  0.1223,
         0.0646])
0
tensor([ 0.1834,  0.1064,  0.1186,  0.0467,  0.0900,  0.1851,  0.1253,
         0.1446])
0
tensor([ 0.2807,  0.0979,  0.2032,  0.0688,  0.0705,  0.0832,  0.0788,
         0.1170])
0
Episode Done!
tensor([ 0.1344,  0.1552,  0.1346,  0.0446,  0.1430,  0.0993,  0.1994,
         0.0895])
0
tensor([ 0.0480,  0.2436,  0.1520,  0.0414,  0.1148,  0.0778,  0.1936,
         0.1287])
3
tensor([ 0.1430,  0.0970,  0.1798,  0.0880,  0.1

RuntimeError: invalid argument 2: invalid multinomial distribution (encountering probability entry < 0) at c:\programdata\miniconda3\conda-bld\pytorch-cpu_1524541161962\work\aten\src\th\generic/THTensorRandom.cpp:326