In [1]:
import gym
import random
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from tensorboardX import SummaryWriter
from collections import namedtuple, deque
from torch.distributions import Categorical

reference: https://github.com/g6ling/Reinforcement-Learning-Pytorch-Cartpole

Memory/Replay Buffer Class

In [2]:
Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask'))

class Memory(object):
    def __init__(self):
        self.memory = deque()

    def push(self, state, next_state, action, reward, mask):
        self.memory.append(Transition(state, next_state, action, reward, mask))

    def sample(self):
        memory = self.memory
        return Transition(*zip(*memory)) 

    def __len__(self):
        return len(self.memory)


Utility Functions

In [3]:
def flat_grad(grads):
    grad_flatten = []
    for grad in grads:
        grad_flatten.append(grad.view(-1))
    grad_flatten = torch.cat(grad_flatten)
    return grad_flatten

def flat_hessian(hessians):
    hessians_flatten = []
    for hessian in hessians:
        hessians_flatten.append(hessian.contiguous().view(-1))
    hessians_flatten = torch.cat(hessians_flatten).data
    return hessians_flatten

def flat_params(model):
    params = []
    for param in model.parameters():
        params.append(param.data.view(-1))
    params_flatten = torch.cat(params)
    return params_flatten

def update_model(model, new_params):
    index = 0
    for params in model.parameters():
        params_length = len(params.view(-1))
        new_param = new_params[index: index + params_length]
        new_param = new_param.view(params.size())
        params.data.copy_(new_param)
        index += params_length

def kl_divergence(policy, old_policy):
    kl = old_policy * torch.log(old_policy / policy)

    kl = kl.sum(1, keepdim=True)
    return kl

def fisher_vector_product(net, states, p, cg_damp=0.1):
    policy = net(states)
    old_policy = net(states).detach()
    kl = kl_divergence(policy, old_policy)
    kl = kl.mean()
    kl_grad = torch.autograd.grad(kl, net.parameters(), create_graph=True) # create_graph is True if we need higher order derivative products
    kl_grad = flat_grad(kl_grad)

    kl_grad_p = (kl_grad * p.detach()).sum()
    kl_hessian_p = torch.autograd.grad(kl_grad_p, net.parameters())
    kl_hessian_p = flat_hessian(kl_hessian_p)

    return kl_hessian_p + cg_damp * p.detach()


def conjugate_gradient(net, states, loss_grad, n_step=10, residual_tol=1e-10):
    x = torch.zeros(loss_grad.size())
    r = loss_grad.clone()
    p = loss_grad.clone()
    r_dot_r = torch.dot(r, r)

    for i in range(n_step):
        A_dot_p = fisher_vector_product(net, states, p)
        alpha = r_dot_r / torch.dot(p, A_dot_p)
        x += alpha * p
        r -= alpha * A_dot_p
        new_r_dot_r = torch.dot(r,r)
        betta = new_r_dot_r / r_dot_r
        p = r + betta * p
        r_dot_r = new_r_dot_r
        if r_dot_r < residual_tol:
            break
    return x

Agent

In [4]:
class TRPO(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(TRPO, self).__init__()
        self.t = 0
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs

        self.fc_1 = nn.Linear(num_inputs, 128)
        self.fc_2 = nn.Linear(128, 64)
        self.fc_3 = nn.Linear(64, num_outputs)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform(m.weight)

    def forward(self, input):
        x = torch.relu(self.fc_1(input))
        x = torch.relu(self.fc_2(x))
        policy = F.softmax(self.fc_3(x),dim=-1)

        return policy

    @classmethod
    def train_model(cls, net, transitions):
        states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask

        states = torch.stack(states)
        actions = torch.stack(actions)
        rewards = torch.Tensor(rewards)
        masks = torch.Tensor(masks)

        returns = torch.zeros_like(rewards)

        running_return = 0
        for t in reversed(range(len(rewards))):
            running_return = rewards[t] + gamma * running_return * masks[t]
            returns[t] = running_return

        policy = net(states)
        policy = policy.view(-1, net.num_outputs)
        policy_action = (policy * actions.detach()).sum(dim=1)

        old_policy = net(states).detach()
        old_policy = old_policy.view(-1, net.num_outputs)
        old_policy_action = (old_policy * actions.detach()).sum(dim=1)

        surrogate_loss = ((policy_action / old_policy_action) * returns).mean()

        surrogate_loss_grad = torch.autograd.grad(surrogate_loss, net.parameters())
        surrogate_loss_grad = flat_grad(surrogate_loss_grad)

        step_dir = conjugate_gradient(net, states, surrogate_loss_grad.data)

        params = flat_params(net)
        shs = (step_dir * fisher_vector_product(net, states, step_dir)).sum(0, keepdim=True)
        step_size = torch.sqrt((2 * max_kl) / shs)[0]
        full_step = step_size * step_dir

        fraction = 1.0
        for _ in range(10):
            new_params = params + fraction * full_step
            update_model(net, new_params)
            policy = net(states)
            policy = policy.view(-1, net.num_outputs)
            policy_action = (policy * actions.detach()).sum(dim=1)
            surrogate_loss = ((policy_action / old_policy_action) * returns).mean()

            kl = kl_divergence(policy, old_policy)
            kl = kl.mean()

            if kl < max_kl:
                break
            fraction = fraction * 0.5

        return -surrogate_loss

    def get_action(self, input):
        try:
            policy = Categorical(self.forward(input))
#         policy = policy[0].data.numpy()
        
        except:
            print("Nan occured, terminating")
            return -1
        
        action = policy.sample()
        
        return action


# CartPole

In [8]:
env_name = 'CartPole-v1'
gamma = 0.99
goal_score = 200
log_interval = 10
max_kl = 0.001

In [9]:
env = gym.make(env_name)
env.seed(500)
torch.manual_seed(500)

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.n
print('state size:', num_inputs)
print('action size:', num_actions)

net = TRPO(num_inputs, num_actions)
writer = SummaryWriter('logs')


running_score = 0
steps = 0
loss = 0
for e in range(30000):
    done = False
    memory = Memory()

    score = 0
    state = env.reset()
    state = torch.Tensor(state)
    state = state.unsqueeze(0)

    while not done:
        steps += 1

        action = net.get_action(state)
        next_state, reward, done, _ = env.step(action)

        next_state = torch.Tensor(next_state)
        next_state = next_state.unsqueeze(0)

        mask = 0 if done else 1
        reward = reward if not done or score == 499 else -1

        action_one_hot = torch.zeros(2)
        action_one_hot[action] = 1
        memory.push(state, next_state, action_one_hot, reward, mask)

        score += reward
        state = next_state

    loss = TRPO.train_model(net, memory.sample())

    score = score if score == 500.0 else score + 1
    running_score = 0.99 * running_score + 0.01 * score
    if e % log_interval == 0:
        print('{} episode | score: {:.2f}'.format(e, running_score))
        print("Average steps per episode:", steps/log_interval)
        steps = 0
        writer.add_scalar('log/score', float(running_score), e)
        writer.add_scalar('log/loss', float(loss), e)

    if running_score > goal_score:
        break

state size: 4
action size: 2
0 episode | score: 0.20


  nn.init.xavier_uniform(m.weight)
  policy = F.softmax(self.fc_2(x))


10 episode | score: 2.93
20 episode | score: 5.46
30 episode | score: 6.93
40 episode | score: 8.54
50 episode | score: 10.45
60 episode | score: 13.11
70 episode | score: 16.10
80 episode | score: 18.57
90 episode | score: 21.83
100 episode | score: 25.27
110 episode | score: 28.13
120 episode | score: 28.03
130 episode | score: 28.41
140 episode | score: 29.44
150 episode | score: 32.23
160 episode | score: 34.21
170 episode | score: 35.29
180 episode | score: 37.53
190 episode | score: 40.59
200 episode | score: 43.23
210 episode | score: 46.44
220 episode | score: 49.07
230 episode | score: 50.52
240 episode | score: 51.84
250 episode | score: 53.11
260 episode | score: 55.19
270 episode | score: 55.46
280 episode | score: 55.81
290 episode | score: 57.23
300 episode | score: 57.25
310 episode | score: 58.80
320 episode | score: 60.02
330 episode | score: 59.54
340 episode | score: 58.80
350 episode | score: 60.66
360 episode | score: 60.49
370 episode | score: 61.65
380 episode | 

# Machine GMM 

In [5]:
from machine import Machine
from GymMachEnv import MachineEnv

In [6]:
gamma = 0.95
log_interval = 10
max_kl = 0.001

In [7]:
machine = Machine()
machine.curr_state = 0
env = MachineEnv(machine)

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.n



In [8]:
net = TRPO(num_inputs, num_actions)

writer = SummaryWriter('./logs/TRPO_5')

best_score = 0
running_score = 0
Total_score = 0
steps = 0
loss = 0
maintenance_count = 0
for e in range(2000):
    done = False
    memory = Memory()

    score = 0
    state = env.reset()
    state = torch.Tensor(state)
    state = state.unsqueeze(0)

    while not done:
        steps += 1

        action = net.get_action(state)
        
        if action == -1: break
            
        if action == 1: maintenance_count+=1
        
        next_state, reward, done, _ = env.step(action)

        next_state = torch.Tensor(next_state)
        next_state = next_state.unsqueeze(0)

        mask = 0 if done else 1

        action_one_hot = torch.zeros(2)
        action_one_hot[action] = 1
        memory.push(state, next_state, action_one_hot, reward, mask)

        score += reward
        state = next_state
    
    if action == -1: break
        
    loss = TRPO.train_model(net, memory.sample())

    running_score = 0.99 * running_score + 0.01 * score
    Total_score+=score
    
    if e % log_interval == 0:
        print('{} episode | Average score: {:.2f} | Average steps per episode: {}| Average number of maintenance: {}'.format(
            e, Total_score/log_interval,steps/log_interval, maintenance_count/log_interval))
        Total_score = 0
        maintenance_count=0
        steps = 0
        writer.add_scalar('log/Average_score', float(Total_score/log_interval), e)
        writer.add_scalar('log/loss', float(loss), e)
        
        if best_score < running_score:
            best_score = running_score
            torch.save(net,'TRPO_agent.pt')

  nn.init.xavier_uniform(m.weight)


0 episode | Average score: 2730.00 | Average steps per episode: 7.8| Average number of maintenance: 4.6
10 episode | Average score: 18600.00 | Average steps per episode: 48.9| Average number of maintenance: 27.8
20 episode | Average score: 18600.00 | Average steps per episode: 52.1| Average number of maintenance: 29.4
30 episode | Average score: 23490.00 | Average steps per episode: 48.2| Average number of maintenance: 20.9
40 episode | Average score: 24970.00 | Average steps per episode: 48.8| Average number of maintenance: 17.6
50 episode | Average score: 15620.00 | Average steps per episode: 33.3| Average number of maintenance: 14.3
60 episode | Average score: 28490.00 | Average steps per episode: 54.5| Average number of maintenance: 20.4
70 episode | Average score: 18580.00 | Average steps per episode: 35.8| Average number of maintenance: 12.9
80 episode | Average score: 28850.00 | Average steps per episode: 67.3| Average number of maintenance: 31.8
90 episode | Average score: 2690

760 episode | Average score: 13300.00 | Average steps per episode: 27.6| Average number of maintenance: 7.6
770 episode | Average score: 15830.00 | Average steps per episode: 30.6| Average number of maintenance: 7.7
780 episode | Average score: 29920.00 | Average steps per episode: 51.1| Average number of maintenance: 15.8
790 episode | Average score: 31740.00 | Average steps per episode: 59.2| Average number of maintenance: 21.2
800 episode | Average score: 16550.00 | Average steps per episode: 32.6| Average number of maintenance: 11.7
810 episode | Average score: 19580.00 | Average steps per episode: 39.5| Average number of maintenance: 15.5
820 episode | Average score: 30700.00 | Average steps per episode: 53.9| Average number of maintenance: 16.7
830 episode | Average score: 33550.00 | Average steps per episode: 59.0| Average number of maintenance: 19.4
840 episode | Average score: 18390.00 | Average steps per episode: 34.9| Average number of maintenance: 12.4
850 episode | Average

1510 episode | Average score: 24720.00 | Average steps per episode: 45.4| Average number of maintenance: 15.3
1520 episode | Average score: 26920.00 | Average steps per episode: 51.8| Average number of maintenance: 19.1
1530 episode | Average score: 20870.00 | Average steps per episode: 40.2| Average number of maintenance: 14.2
1540 episode | Average score: 42780.00 | Average steps per episode: 77.2| Average number of maintenance: 26.4
1550 episode | Average score: 14940.00 | Average steps per episode: 32.6| Average number of maintenance: 14.7
1560 episode | Average score: 21700.00 | Average steps per episode: 73.7| Average number of maintenance: 51.3
1570 episode | Average score: 11840.00 | Average steps per episode: 50.4| Average number of maintenance: 35.8
1580 episode | Average score: 13130.00 | Average steps per episode: 42.5| Average number of maintenance: 28.6
1590 episode | Average score: 16380.00 | Average steps per episode: 59.6| Average number of maintenance: 42.6
1600 episo

# Test Trained Agent

In [5]:
agent = torch.load('TRPO_agent.pt')
agent.eval()

TRPO(
  (fc_1): Linear(in_features=4, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=64, bias=True)
  (fc_3): Linear(in_features=64, out_features=2, bias=True)
)