In [19]:
import math
import random
from collections import deque
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from matplotlib.pyplot import plot

%matplotlib inline

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
EpsilonGreedyPolicy = namedtuple('EpsilonGreedyPolicy', ('start', 'end', 'decay'))

In [22]:
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        experiences = random.sample(self.memory, k=batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)


In [23]:
class DQN(nn.Module):
    def __init__(self, input_size=37, output_size=4):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # x = F.softmax(x, dim=1)  # why should I not use softmax here?
        return x


In [24]:
BATCH_SIZE = 64
GAMMA = 0.99
TAU = 1e-3
LR = 5e-4
UPDATE_EVERY = 4
DEFAULT_GREEDY_POLICY = EpsilonGreedyPolicy(start=0.9, end=0.05, decay=200)


class Agent(object):
    def __init__(self, n_states=37, n_actions=4, replay_memory_size=10000, greedy_policy=DEFAULT_GREEDY_POLICY):
        self.n_states = n_states
        self.n_actions = n_actions
        
        self.replay = ReplayMemory(replay_memory_size)
        self.qnetwork = DQN()
        self.qnetwork_target = DQN()
        
        self.qnetwork_target.load_state_dict(self.qnetwork.state_dict())
        self.qnetwork_target.eval()
        
        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr=LR)
        
        self.greedy_policy = greedy_policy
        self.n_steps = 0
        self.t_steps = 0

    def select_action(self, state):
        sample = random.random()
        threshold = self.greedy_policy.end + (self.greedy_policy.start - self.greedy_policy.end) * math.exp(-1. * self.n_steps / self.greedy_policy.decay)
        
        self.n_steps += 1
        if sample > threshold:
            with torch.no_grad():
                return self.qnetwork(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.push(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        if self.n_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        # obtain random minibatch of tuples (Sj, Aj, Rj, Sj+1)
        # set target yi = ri + gamma * maxa q(St+1, a, w-)
        # update 𝞓w
        # every n steps w- <- w
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork, self.qnetwork_target, TAU)  

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [25]:
AVERAGE_WINDOW_SCORE = 100
AVERAGE_CUTOFF_SCORE = 13.0


def train_agent(agent, env, n_episodes=2000, max_t=1000):
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=AVERAGE_WINDOW_SCORE)  # last 100 scores

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        
        if i_episode % AVERAGE_WINDOW_SCORE == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        
        if np.mean(scores_window) >= AVERAGE_CUTOFF_SCORE:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break

    return scores


In [26]:
## Initializing the Unity environment
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name="Banana.app")

brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space 
state = env_info.vector_observations[0]
print('States look like:', state)

state_size = len(state)
print('States have length:', state_size)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


Number of actions: 4
States look like: [1.         0.         0.         0.         0.84408134 0.
 0.         1.         0.         0.0748472  0.         1.
 0.         0.         0.25755    1.         0.         0.
 0.         0.74177343 0.         1.         0.         0.
 0.25854847 0.         0.         1.         0.         0.09355672
 0.         1.         0.         0.         0.31969345 0.
 0.        ]
States have length: 37


In [30]:
class UnityEnvironmentWrapper:
    def __init__(self, env, brain_name):
        self.env = env
        self.brain_name = brain_name
    
    def step(self, action):
        env_info = self.env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        _ = 0  # to follow the same API
        return next_state, reward, done, _

    def reset(self):
        self.env.reset(train_mode=True)[self.brain_name]

environment = UnityEnvironmentWrapper(env=env, brain_name=brain_name)
agent = Agent(n_actions=action_size, n_states=state_size)
train_agent(agent, environment, n_episodes=100)

AttributeError: 'Tensor' object has no attribute 'keys'