In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
from torch import nn

In [None]:
class Agent():
    """Minimal base class for agents that map an observation to an action.

    Subclasses implement ``act``; calling the agent instance (``agent(obs)``)
    forwards to it.
    """

    def __init__(self, observation_dim, params = None, action_bounds = None):
        """
        observation_dim: size of one observation
        params: optional agent parameters
        action_bounds: optional bounds on the actions

        The base class stores none of these; subclasses keep what they need.
        """
        pass

    def act(self, obs):
        """Return the action for ``obs``. Must be overridden by subclasses."""
        # Declared explicitly so that a subclass which forgets to implement it
        # fails with a clear message instead of an AttributeError raised from
        # inside __call__.
        raise NotImplementedError(
            f"{type(self).__name__} must implement act(obs)"
        )

    def __call__(self, obs):
        """Shorthand for ``self.act(obs)``."""
        return self.act(obs)
    

# code for A3    
class DQN(Agent):
    """Deep Q-learning agent with an online network and a target network.

    ``self.q`` approximates Q(s, a; w) and is the only network the optimiser
    updates. ``self.q_target`` approximates Q(s, a; w-) and is used solely to
    build the 1-step bootstrap targets; it is synchronised with ``self.q`` at
    construction and again at the end of every ``train_epoch``.
    """

    def __init__(self, observation_dim, action_dim, gamma=0.9, epsilon=0.15):
        """
        observation_dim: int, length of one observation vector
        action_dim: int, number of discrete actions
        gamma: float, discount factor used in the Q-learning target
        epsilon: float in [0, 1], exploration rate of the epsilon-greedy
            policy. The default of 0.15 is the value previously hard-coded in
            the policy (author's note: 0.05 and 0.1 were found unstable).
        """
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(f"epsilon must lie in [0, 1], got {epsilon}")
        super().__init__(observation_dim)

        self.actions = action_dim
        self.obs_dim = observation_dim
        self.gamma = gamma
        self.epsilon = epsilon

        # Q(s, .; w): given a batch of states, outputs one value per action.
        self.q = self._build_q_network()
        # Q(s, .; w-): same architecture, used only for bootstrap targets.
        self.q_target = self._build_q_network()
        # Start the target network as an exact copy of the online network.
        # Without this the first epoch regresses towards the outputs of an
        # unrelated, randomly initialised network.
        self.q_target.load_state_dict(self.q.state_dict())

        # Gradient descent is performed on the online network only.
        self.optim = torch.optim.Adam(self.q.parameters(), lr=1e-4)
        self.N_STEPS = 1000     # gradient steps per epoch
        self.BATCH_SIZE = 512   # transitions sampled per gradient step
        self.N_EPOCHS = 200     # collect-then-train rounds in train()

    def _build_q_network(self):
        """Return a fresh double-precision MLP: (batch, obs_dim) -> (batch, actions)."""
        return nn.Sequential(
            nn.Linear(self.obs_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, self.actions),
        ).double()

    def compute_target(self, states, rewards):
        """
        states: torch.Tensor of size (batch, obs_dim) with s' from the dataset
        rewards: torch.Tensor of size (batch) or (batch, 1) with single step
            rewards (float)

        returns torch.Tensor of size (batch) with the 1-step Q learning target
            y_i = r_i + gamma * max_a' Q(s'_i, a'; w-)
        """
        # The target is treated as a constant: no gradient flows through it.
        with torch.no_grad():
            next_q = self.q_target(states)            # (batch, actions)
            best_next_q = next_q.max(dim=1).values    # (batch,)
            # reshape(-1) rather than a dimension-less squeeze so the result
            # stays 1-D even for a batch of one.
            return rewards.reshape(-1) + self.gamma * best_next_q

    def loss(self, states, actions, target):
        """
        states: torch.Tensor of size (batch, obs_dim) with s from the dataset
        actions: torch.Tensor of size (batch, 1) or (batch) with the index of
            the action taken in each state
        target: torch.Tensor of size (batch) with computed target
            (see self.compute_target)

        returns scalar torch.Tensor with the mean squared Q error
        """
        q_all = self.q(states)                        # (batch, actions)
        # gather needs an int64 index of shape (batch, 1).
        action_idx = actions.reshape(-1, 1).long()
        q_taken = q_all.gather(1, action_idx).reshape(-1)
        # Both operands are forced to shape (batch,) so that a (batch, 1)
        # target can never broadcast against (batch,) into a (batch, batch)
        # error matrix.
        return nn.functional.mse_loss(q_taken, target.reshape(-1), reduction='mean')

    def act(self, state):
        """
        state: np.array of size (obs_dim,) with the current state

        returns a NumPy integer scalar: the action index sampled from the
        epsilon-greedy distribution over the online network's Q-values (the
        greedy action gets probability 1 - epsilon + epsilon / actions, every
        other action epsilon / actions).
        """
        obs = torch.from_numpy(np.asarray(state, dtype=np.float64)).reshape(1, -1)
        # Pure inference: no autograd graph is needed to pick an action.
        with torch.no_grad():
            q_values = self.q(obs)[0].numpy()         # (actions,)

        explore_share = self.epsilon / self.actions
        action_probs = np.full(self.actions, explore_share)
        action_probs[np.argmax(q_values)] = 1 - self.epsilon + explore_share
        return np.random.choice(np.arange(self.actions), p=action_probs)

    def __call__(self, state):
        """
        state: np.array of size (obs_dim,) with the current state
        returns the epsilon-greedy action index as a NumPy integer scalar
        (see self.act)
        """
        return self.act(state)

    def train_epoch(self, states, actions, reward, next_states):
        """Run N_STEPS gradient steps on random minibatches, then sync the target net.

        states, next_states: np.array of size (runs, steps, obs_dim)
        actions, reward: np.array of size (runs, steps, 1)

        returns list of float with the loss of every gradient step
        """
        # Do not modify
        num_runs = states.shape[0]
        len_runs = states.shape[1]
        losses = []

        for _ in range(self.N_STEPS):
            # Sample BATCH_SIZE (run, step) index pairs uniformly with
            # replacement.
            batch_x = np.random.randint(num_runs, size=(self.BATCH_SIZE,))
            batch_y = np.random.randint(len_runs, size=(self.BATCH_SIZE,))
            # batch_states / batch_next_states: (batch, obs_dim)
            # batch_actions / batch_rewards:    (batch, 1)
            batch_states = torch.from_numpy(states[batch_x, batch_y])
            batch_actions = torch.from_numpy(actions[batch_x, batch_y]).to(int)
            batch_rewards = torch.from_numpy(reward[batch_x, batch_y])
            batch_next_states = torch.from_numpy(next_states[batch_x, batch_y])

            target = self.compute_target(batch_next_states, batch_rewards)
            loss = self.loss(batch_states, batch_actions, target)

            # Clear the gradients left over from the previous step so they do
            # not accumulate, backpropagate, then update the online network.
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()
            losses.append(loss.item())

        # Refresh the target network with the weights just learned.
        with torch.no_grad():
            self.q_target.load_state_dict(self.q.state_dict())
        return losses

    def train(self, task):
        """Alternate data collection and training for N_EPOCHS rounds.

        task: environment object exposing reset() and step(action)

        returns list of float with the losses of all gradient steps, in order
        """
        # Do not modify
        losses = []
        # Rollouts with uniformly random actions.
        # NOTE(review): these arrays are overwritten by the first loop
        # iteration below before any training uses them - confirm whether the
        # random data was meant to seed the first epoch.
        states, actions, rewards = self.collect_data(task, random=True)

        for _ in range(self.N_EPOCHS):
            # Fresh rollouts under the current policy (see collect_data).
            states, actions, rewards = self.collect_data(task, random=False)
            # states holds steps + 1 entries per run: drop the last one to get
            # s and the first one to get the aligned s'.
            epoch_losses = self.train_epoch(states[:, :-1], actions, rewards, states[:, 1:])
            losses += epoch_losses
        return losses

    def collect_data(self, task, random=False):
        """Roll out 100 runs of 100 steps each and record the transitions.

        task: environment object exposing reset() and step(action)
        random: if True every action is drawn uniformly at random; otherwise
            the agent's own action is used with probability 0.9 and a uniform
            random action with probability 0.1

        returns (states, actions, rewards) as np.arrays of size
            (100, 101, obs_dim), (100, 100, 1) and (100, 100, 1)
        """
        # Do not modify
        rewards = np.zeros((100,100,1))
        states = np.zeros((100,101,self.obs_dim))
        actions = np.zeros((100,100,1))

        for run in range(100):
            obs = task.reset()
            for step in range(100):
                states[run, step] = obs
                # Evaluated even when random=True, which keeps the sequence of
                # random-number draws independent of the flag's handling.
                act = np.random.choice([self(obs), np.random.randint(self.actions)], p=[0.9, 0.1])
                if random:
                    act = np.random.randint(self.actions)
                # NOTE(review): `done` is ignored, so stepping continues after
                # the environment reports termination - confirm this is
                # intended for this task.
                obs, rew, done, info = task.step(act)
                rewards[run, step] = rew    # reward is given by the dynamics
                actions[run, step] = act    # record that action
            states[run, -1] = obs   # final state

        print(f"Average return in training: {np.mean(rewards)}")
        return states, actions, rewards

In [None]:
# Do not modify
# Driver cell: build the environment, train a DQN agent on it and save the
# loss curve.
# NOTE(review): `Breakout` is neither defined nor imported in the visible
# cells - presumably provided by another cell or a course module; confirm.
breakout_env = Breakout()
breakout_env.main()
breakout_env.make()

# DQN feeds these two values to nn.Linear and np.zeros as sizes, so they are
# assumed to be plain integers here - TODO confirm against Breakout.
agent = DQN(breakout_env.observation_space, breakout_env.action_space)

# `losses` holds one value per gradient step, concatenated over all epochs.
losses = agent.train(breakout_env)
plt.plot(np.arange(len(losses)), losses)
# Writes the current figure to a file named '3a'.
plt.savefig('3a')