### [Reference](https://www.oreilly.com/ideas/reinforcement-learning-for-complex-goals-using-tensorflow)

In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from gridworld_goals import *

In [2]:
class ExperienceBuffer():
    def __init__(self, buffer_size = 50000):
        self.buffer = []
        self.buffer_size = buffer_size
    
    def add(self, experience):
        if len(list(self.buffer)) + len(list(experience)) >= self.buffer_size:
            self.buffer[0:(len(list(experience))+len(list(self.buffer)))-self.buffer_size] = []
        self.buffer.extend(zip(experience))
            
    def sample(self,size):
        return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5])
    
def get_f(m, offsets):
    f = np.zeros([len(m), m.shape[1], len(offsets)])
    for i, offset in enumerate(offsets):
        f[:-offset, :, i] = m[offset:, :] - m[:-offset, :]
        if i > 0:
            f[-offset:, :, i] = f[-offset:, :, i-1]
    return f

In [3]:
class DFP(nn.Module):
    def __init__(self, action_size, observation_shape, num_measurements, num_offsets):
        super(DFP, self).__init__()
        
        self.observation_shape = observation_shape
        self.num_measurements = num_measurements
        self.action_size = action_size
        self.num_goals = num_measurements
        self.num_offsets = num_offsets
        
        self.h_o = nn.Linear(int(np.prod(observation_shape)), 128)
        self.h_m = nn.Linear(num_measurements, 64)
        self.h_g = nn.Linear(num_measurements, 64)
        self.h = nn.Linear(128 + 64 + 64, 256)
        
        # Calculate separate expectations and advantage stream
        self.h_expectation = nn.Linear(256, action_size * self.num_offsets * self.num_measurements)
        self.h_advantages = nn.Linear(256, action_size * self.num_offsets * self.num_measurements)
        
        
    def forward(self, observation, measurement, goals, temp):
        observation_flatten = observation.view(observation.size()[0], -1)
        h_o = F.elu(self.h_o(observation_flatten))
        h_m = F.elu(self.h_m(measurement))
        h_g = F.elu(self.h_g(goals))
        
        h = torch.cat([h_o, h_m, h_g], dim=1)
        h_ = F.elu(self.h(h))
        
        expectations = self.h_expectation(h_)
        advantages = self.h_advantages(h_)
        advantages = advantages - advantages.mean(1).expand_as(advantages)
        
        predictions = expectations + advantages
        predictions = predictions.view(-1, self.num_measurements, self.action_size, self.num_offsets)
        
        boltzman = F.softmax(predictions.mean(3) / temp)
        boltzman = boltzman.squeeze(3)
        return boltzman, predictions
    
    def compute_loss(self, observation, measurement, goals, temp, action, target):
        boltzman, predictions = self(observation, measurement, goals, temp)
        pred_action = (predictions * action_onehot.view(-1, 1, self.action_size, 1)).sum(2)
        
        loss = nn.MSE(pred_action, target)
        entropy = -(boltzman * torch.log(boltzman + 1e-7)).sum()
        total_loss = loss + entropy
        
        # Backward and optimize step
        '''
        optimizer.zero_grad()
        total_loss.backward()
        torch.nn.utils.clip_grad_norm(self.parameters(), 1.0)
        optimizer.step()
        
        '''
        
        return loss, entropy

In [4]:
a_size = 4 # Number of available actions
num_measurements = 2
learning_rate = 1e-3 
num_episodes = 130
offsets = [1, 2, 4, 8, 16, 32] # Set of temporal offsets

In [124]:
class Trainer():
    def __init__(self):
        self.exp_buff = ExperienceBuffer()
        self.env = gameEnv(partial=False, size=5)
        s, o_big, m, g, h = self.env.reset()
        self.model = DFP(a_size, s.shape, num_measurements, len(offsets))

    def work(self):
        episode_deliveries = []
        episode_lengths = []

        for _ in range(num_episodes):
            episode_buffer = []
            episode_frames = []
            d = False
            t = 0
            temp = 0.25 #How spread out we want our action distribution to be

            s, o_big, m, g, h = self.env.reset()
            current_goal = None

            while d is False:
                if m[1] <= .3:
                    current_goal = np.array([0.0,1.0])
                else:
                    current_goal = np.array([1.0,0.0])

                # Convert to Variable
                m = np.array(m)
                s_var = Variable(torch.from_numpy(s)).float().unsqueeze(0)
                m_var = Variable(torch.from_numpy(m)).float().unsqueeze(0)
                g_var = Variable(torch.from_numpy(current_goal)).float().unsqueeze(0)

                # Compute action probabilities
                boltzman, _ = self.model(s_var, m_var, g_var, temp)
                b = current_goal * boltzman.data.numpy()[0].T
                c = np.sum(b,1)
                c /= c.sum()
                a = np.random.choice(c, p=c)
                a = np.argmax(c == a)

                # Add to episode buffer
                episode_buffer.append([s, a, m, current_goal, np.zeros(len(offsets))])

                # Perform the action on the environment
                s, s1_big, m, g, h, d = self.env.step(a) 
                t += 1

                # End the episode after 100 steps
                if t > 100:
                    d = True

            # Training statistics
            episode_deliveries.append(m[0])
            episode_lengths.append(t)

            # Update the network using experience buffer at the end
            # of the episode
            self.train(episode_buffer)    

    def train(self, rollout):
        rollout = np.array(rollout)
        measurements = np.vstack(rollout[:,2])
        targets = get_f(measurements, offsets)
        rollout[:,4] = zip(targets)
        self.exp_buff.add(rollout)
        
        print('check target shape', targets.shape)
        
        # Get a batch of experiences from the buffer and 
        # use them to update the global network
        if len(self.exp_buff.buffer) > 128:
            exp_batch = self.exp_buff.sample(128)

            observation_batch = np.stack(exp_batch[:, 0], axis=0)
            measurement_batch = np.vstack(exp_batch[:, 2])
            temperature = 0.1
            action_batch = exp_batch[:, 1]
            target_batch_zip = np.vstack(exp_batch[:, 4])
            target_batch = []
            
            for z in target_batch_zip:
                arr = np.array([*z[0]])
                target_batch.append(arr)
            
            target_batch = np.array(target_batch)
            print(target_batch[110].shape)
            
            #target_batch = np.vstack([*exp_batch[:, 4]])
            goal_batch = np.vstack(exp_batch[:, 3])

            # Convert to variables
            obs_var = Variable(torch.from_numpy(observation_batch)).float()
            mea_var = Variable(torch.from_numpy(measurement_batch)).float()
            goa_var = Variable(torch.from_numpy(goal_batch)).float()
            act_var = Variable(torch.from_numpy(action_batch.astype(np.int32))).float()
            tar_var = Variable(torch.from_numpy(target_batch.astype(np.float32))).float()
            
            loss, entropy = self.model.compute_loss(obs_var, mea_var,
                                                    goa_var, temperature, 
                                                    act_var, tar_var)
            return loss / len(rollout), entropy / len(rollout)
        else:
            return 0, 0

In [125]:
trainer = Trainer()
trainer.work()

check target shape (43, 2, 6)
check target shape (80, 2, 6)
check target shape (40, 2, 6)
(0,)


ValueError: setting an array element with a sequence.