In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import torchvision
import torchvision.transforms as transforms
import numpy as np
from Simulation import Level

# Two hand-written level layouts used to test policy learning:
# L1 is the simple example and L2 is the complex one.
# Each layout is a list of equal-length strings, one string per row.
L1 = [
    "11111",
    "11111",
    "11111",
    "11111",
    "11X11",
    "11111",
    "11111",
]

L2 = [
    "00011",
    "00011",
    "01011",
    "01111",
    "11011",
    "10000",
    "00100",
    "11011",
    "10111",
    "11111",
    "11X11",
    "11111",
    "11111",
]

# Wrap each raw layout in a Level object from the Simulation module.
simple_level, complex_level = Level(L1), Level(L2)

In [2]:
# Show both levels, one after the other, to confirm they were built.
print(simple_level, complex_level, sep="\n")

11111
11111
11111
11111
11X11
11111
11111

00011
00011
01011
01111
11011
10000
00100
11011
10111
11111
11X11
11111
11111



In [3]:
# Hyperparameters

# -- network shape --
agent_view = 5 * 5 * 3   # length of the input vector (Policy.state_space)
agent_choices = 8        # number of discrete actions (Policy.action_space)
hidden_size = 128        # width of both hidden layers
dropout_prob = 0.6       # dropout probability applied after each hidden layer

# -- optimisation --
learning_rate = 0.001    # step size handed to the Adam optimizer
gamma = 0.99             # discount factor for future rewards

class Policy(nn.Module):
    """Feed-forward policy network that maps a state vector to action probabilities.

    The network is three linear layers (``l1`` -> ``l2`` -> ``l3``); each of
    the first two is followed by dropout and a SELU activation, and the last
    is followed by a softmax over the final dimension.

    Besides the layers, the instance stores the bookkeeping that the training
    loop reads and writes:

    * ``policy_history`` - log-probabilities of the actions taken during the
      current episode (an empty tensor between episodes).
    * ``reward_episode`` - rewards collected during the current episode.
    * ``reward_history`` / ``loss_history`` - one entry per finished episode.

    Sizes and the discount factor are read from the module-level
    hyperparameters ``agent_view``, ``agent_choices``, ``hidden_size`` and
    ``gamma`` at construction time; ``dropout_prob`` is read on every forward
    pass.
    """

    def __init__(self):
        super().__init__()
        self.state_space = agent_view      # Input vector length
        self.action_space = agent_choices  # Number of choices

        # Neural Net architecture
        self.l1 = nn.Linear(self.state_space, hidden_size, bias=True)
        self.l2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.l3 = nn.Linear(hidden_size, self.action_space, bias=False)

        self.gamma = gamma

        # Episode policy and reward history
        self.policy_history = torch.Tensor()
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):
        """Return the action probabilities for state ``x``.

        Dropout is applied through the functional API with
        ``training=self.training`` so that it follows ``train()`` / ``eval()``.
        Building fresh ``nn.Dropout`` modules inside ``forward`` would leave
        them permanently in training mode, so ``eval()`` would never switch
        dropout off.

        Args:
            x: float tensor whose last dimension has ``state_space`` entries.

        Returns:
            Tensor of probabilities over the ``action_space`` actions
            (softmax along the last dimension).
        """
        x = self.l1(x)
        x = F.selu(F.dropout(x, p=dropout_prob, training=self.training))
        x = self.l2(x)
        x = F.selu(F.dropout(x, p=dropout_prob, training=self.training))
        return F.softmax(self.l3(x), dim=-1)
        

In [4]:
# Build the policy network and the Adam optimizer that updates its weights.
policy = Policy()
optimizer = optim.Adam(params=policy.parameters(), lr=learning_rate)

In [5]:
# Implement select_action here
def select_action(state):
    """Sample an action from the current policy and record its log-probability.

    The state is converted to a float32 tensor and passed through the global
    ``policy``; the resulting probabilities parameterise a categorical
    distribution from which one action is drawn. The log-probability of that
    action is appended to ``policy.policy_history`` so that the policy update
    can later weight it by the discounted return.

    Args:
        state: array-like observation (a NumPy array in the training loop).

    Returns:
        The sampled action as a tensor; call ``.item()`` for the Python int.
    """
    state = torch.as_tensor(state, dtype=torch.float32)
    distribution = Categorical(policy(state))
    action = distribution.sample()

    # Add a leading axis so successive log-probs concatenate along dim 0.
    log_prob = distribution.log_prob(action).unsqueeze(0)

    if policy.policy_history.nelement() == 0:
        # First step of the episode: start a fresh history.
        policy.policy_history = log_prob
    else:
        policy.policy_history = torch.cat([policy.policy_history, log_prob])

    return action
        

In [6]:
# We apply the Monte-Carlo Policy Gradient (REINFORCE) update to improve our
# policy, using the episode that has just finished.
def update_policy():
    """Run one policy-gradient step from the episode stored on ``policy``.

    Reads ``policy.reward_episode`` and ``policy.policy_history``, computes
    the discounted returns, and minimises ``-sum(log_prob * return)`` with
    the global ``optimizer``. Afterwards the loss and the undiscounted
    episode reward are appended to ``policy.loss_history`` /
    ``policy.reward_history`` and the per-episode buffers are cleared.

    If no rewards were recorded, the buffers are cleared and no optimisation
    step is taken.
    """
    episode_rewards = policy.reward_episode

    if not episode_rewards:
        # Nothing to learn from; drop any stale log-probs and bail out.
        policy.policy_history = torch.Tensor()
        policy.reward_episode = []
        return

    # Discount future rewards back to the present using gamma. Accumulate
    # back-to-front and reverse once, rather than inserting at the head.
    R = 0
    returns = []
    for r in reversed(episode_rewards):
        R = r + policy.gamma * R
        returns.append(R)
    returns.reverse()

    # Scale returns. With a single return the (unbiased) standard deviation
    # is NaN, which would turn the loss and every gradient into NaN, so
    # normalisation is only applied when there are at least two returns.
    returns = torch.FloatTensor(returns)
    if returns.numel() > 1:
        eps = float(np.finfo(np.float32).eps)  # matches the tensor's dtype
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Calculate loss
    loss = -(policy.policy_history * returns).sum(-1)

    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() works for 0-dim tensors, where indexing with [0] raises.
    policy.loss_history.append(loss.item())

    # Save and initialize episode history counters
    policy.reward_history.append(np.sum(episode_rewards))
    policy.policy_history = torch.Tensor()
    policy.reward_episode = []

In [7]:
def reward_func(x,steps,done):
    """Reward equal to ``-x``, with a bonus or penalty on the final step.

    While the episode is running the reward is simply ``-x``. When ``done``
    is true, 40 is added if ``x <= 0`` and 20 is subtracted otherwise.
    ``steps`` is accepted but not used.
    """
    base = -x
    if not done:
        return base
    terminal_adjustment = 40 if x <= 0 else -20
    return base + terminal_adjustment

def rfunc2(x, goal_reward=1.0):
    """Inverse reward ``1/x`` that is safe when ``x <= 0``.

    The training loop treats ``x <= 0`` as having reached the end. A plain
    ``1/x`` divides by zero at ``x == 0`` and turns negative for ``x < 0``,
    so those cases return ``goal_reward`` instead.

    Args:
        x: the value returned first by ``level.Act``.
        goal_reward: reward returned when ``x <= 0``. Defaults to 1.0, the
            value of ``1/x`` at ``x == 1``; raise it to reward finishing
            more strongly.

    Returns:
        ``1/x`` for positive ``x``, otherwise ``goal_reward``.
    """
    if x <= 0:
        return goal_reward
    return 1/x

# Not referenced by the code in this cell.
max_reward = 1

# The level that main() trains on.
level = complex_level

def main(episodes):
    """Train the policy on the global ``level`` for ``episodes`` episodes.

    Each episode resets the level, then repeatedly observes the state,
    samples an action, prints it, applies it with ``level.Act`` and stores
    ``rfunc2(x)`` in ``policy.reward_episode`` until the level reports that
    it is done. ``update_policy()`` is called once per finished episode.
    """
    for _ in range(episodes):
        level.Reset()
        finished = False
        while not finished:
            observation = np.asarray(level.getVector())
            choice = select_action(observation).item()
            print(choice)
            x, steps, finished = level.Act(choice)
            policy.reward_episode.append(rfunc2(x))
            if x <= 0:
                print("Reached the end!", end=" ")
        # The episode is over: learn from it, then report.
        update_policy()
        print("Episode Done!")

In [None]:
# Train for 1000 episodes.
main(episodes=1000)

7
2
7




Episode Done!
5
6
7
5
1
Episode Done!
7
6
3
Episode Done!
5
4
2
Episode Done!
7
6
1
2
4
5
6
7
0
1
6
Episode Done!
7
4
6
0
5
0
3
2
7
4
6
Episode Done!
7
5
1
Episode Done!
7
7
Episode Done!
6
0
4
7
7
6
Episode Done!
7
3
Episode Done!
0
7
7
Episode Done!
5
7
5
Episode Done!
0
7
0
5
6
3
2
4
4
2
5
Episode Done!
4
7
7
7
Episode Done!
7
0
0
2
7
7
Episode Done!
5
0
4
0
Episode Done!
0
5
4
6
Episode Done!
7
4
3
2
4
4
7
4
5
2
Episode Done!
0
4
Episode Done!
1
5
Episode Done!
3
7
Episode Done!
4
0
Episode Done!
0
0
3
5
0
7
4
4
Episode Done!
4
0
Episode Done!
0
0
3
6
0
0
4
Episode Done!
5
0
5
Episode Done!
0
3
0
4
Episode Done!
0
7
0
0
7
6
4
4
0
Episode Done!
7
0
0
0
0
5
3
5
Episode Done!
0
0
4
0
Episode Done!
0
4
Episode Done!
0
0
0
Episode Done!
0
0
3
7
0
0
0
7
0
0
0
Episode Done!
0
4
Episode Done!
0
0
0
Episode Done!
0
7
0
0
5
6
5
4
0
Episode Done!
0
0
7
0
0
3
0
0
Episode Done!
0
6
0
3
7
4
4
7
0
0
0
Episode Done!
0
0
0
Episode Done!
0
5
0
0
0
Episode Done!
0
0
0
Episode Done!
0
3
0
0
7
0
0
5
0


Episode Done!
0
0
3
5
0
0
3
0
3
0
3
0
0
Episode Done!
0
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
7
0
0
3
0
3
0
3
0
3
Episode Done!
0
0
3
0
3
0
3
0
3
0
3
7
0
0
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
Episode Done!
0
0
3
0
3
0
3
6
0
0
3
3
3
0
3
0
3
0
5
0
3
0
3
0
3
0
3
0
3
0
Episode Done!
0
0
3
0
3
3
0
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
Episode Done!
6
0
0
3
3
3
0
3
0
3
0
0
3
3
0
0
3
3
0
0
3
0
3
3
0
0
3
0
3
3
Episode Done!
3
0
0
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
7
0
0
3
0
3
Episode Done!
0
0
3
0
3
3
0
0
3
3
2
7
0
0
0
3
3
0
3
0
3
0
3
0
3
0
3
0
3
0
Episode Done!
3
0
3
0
0
0
3
0
3
0
3
0
3
0
3
0
7
0
0
3
3
0
0
3
0
3
3
0
0
3
Episode Done!
0
3
0
3
6
3
0
3
0
3
0
3
0
3
0
7
0
0
3
0
6
Episode Done!
3
0
0
7
0
3
0
0
3
3
0
0
3
3
0
0
3
0
3
3
0
0
3
0
6
3
3
0
0
3
Episode Done!
3
0
0
3
3
0
3
0
3
0
7
0
0
3
0
0
3
3
0
3
0
3
0
3
0
3
0
0
3
3
Episode Done!
7
0
0
3
0
5
3
0
6
3
0
3
0
3
0
3
0
0
3
0
3
3
0
3
0
0
3
0
3
3
Episode Done!
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
3
0
7
0
0
0
3
0
3
3
0
3
Episode Done!
3
