# Navigation Pixels
---
This notebook is based on the DQN coding exercise notebook from the Udacity Deep Reinforcement Learning Nanodegree.

We will train an agent to navigate the Banana environment using visual (pixel) observations.

### 1. Import the Necessary Packages

In [None]:
from unityagents import UnityEnvironment
from collections import deque
from agents import AgentQ
from PIL import Image
import torch, os, numpy as np, torch.optim as optim, matplotlib.pyplot as plt, models as M
%matplotlib inline

### 2. Instantiate the Environment and Agent

Next, we will start the environment!  **_Before running the code cell below_**, change the `file_name` parameter to match the location of the Unity environment that you downloaded.

- **Mac**: `"path/to/VisualBanana.app"`
- **Windows** (x86): `"path/to/VisualBanana_Windows_x86/Banana.exe"`
- **Windows** (x86_64): `"path/to/VisualBanana_Windows_x86_64/Banana.exe"`
- **Linux** (x86): `"path/to/VisualBanana_Linux/Banana.x86"`
- **Linux** (x86_64): `"path/to/VisualBanana_Linux/Banana.x86_64"`
- **Linux** (x86, headless): `"path/to/VisualBanana_Linux_NoVis/Banana.x86"`
- **Linux** (x86_64, headless): `"path/to/VisualBanana_Linux_NoVis/Banana.x86_64"`

For instance, if you are using a Mac, then you downloaded `VisualBanana.app`.  If this file is in the same folder as the notebook, then the line below should appear as follows:
```
env = UnityEnvironment(file_name="VisualBanana.app")
```

Initialize the environment in the code cell below.

In [None]:
# Launch the Unity environment; adjust `file_name` to the location of the downloaded build.
env = UnityEnvironment(file_name="VisualBanana_Windows_x86_64/Banana.exe")

# Use the first brain exposed by the environment as the default brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment in training mode and keep the info of the default brain.
env_info = env.reset(train_mode=True)[brain_name]

# Number of actions, as reported by the brain.
action_size = brain.vector_action_space_size

# Shape of the first visual observation; reused later to build blank padding frames.
# NOTE(review): presumably (1, height, width, channels) -- confirm against the environment.
state_size = env_info.visual_observations[0].shape

In [None]:
def phi1(X):
    """Transform a frame into a grey scale image (array).

    The frame is squeezed, rescaled to 8-bit integers, converted with PIL's 'L'
    mode and divided by 255 again.

    Params
    ======
        X (array): one frame; assumes values in [0, 1] and a shape that squeezes
                   to height x width x 3 -- TODO confirm against the environment
    Returns
    =======
        array with two leading singleton axes added:
        batch_size x in_channels x image_size (1 x 1 x 84 x 84 for 84 x 84 frames)
    """
    # np.round is used instead of the np.round_ alias, which was removed in NumPy 2.0
    frame_uint8 = np.round(np.squeeze(X)*255).astype(np.uint8)
    grey = np.array(Image.fromarray(frame_uint8).convert('L'))/255
    return np.expand_dims(grey, axis=(0, 1))
    

def phi2(X):
    """Reduce a batch of channel-last frames to two channel-first planes.

    The first output channel is the mean of every input channel except the last
    one (yellow = red + green for RGB input); the second output channel is the
    last input channel (blue).
    """
    channels_first = X.transpose((0, 3, 1, 2))
    yellow = np.mean(channels_first[:, :-1], axis=1, keepdims=True)
    blue = channels_first[:, -1:]
    # joining along axis 1 is what np.hstack does for arrays with more than one dimension
    return np.concatenate((yellow, blue), axis=1)

def phi3(im):
    """Move the last axis of a batch of frames to the second position.

    Params
    ======
        im (array): 4-D array, e.g. batch x height x width x channels
    Returns
    =======
        the same data with axes reordered as (0, 3, 1, 2)
    """
    return im.transpose((0, 3, 1, 2))
    

In [None]:
seed = 0
n_frames=4                    # number of frames stacked into one state
phi=phi3                      # frame transformer (phi1, phi2 or phi3)
# Total number of input channels: channels produced by phi for one frame, times n_frames.
# Derived from phi instead of hard-coding 3, so switching phi keeps the network input consistent.
in_channel=phi(np.zeros(state_size)).shape[1]*n_frames

buffer_size = 3*int(1e4)  # replay buffer size
batch_size = 64         # minibatch size

# initialize networks
qnetwork_local = M.DConvQN(in_channel,action_size, seed)
qnetwork_target = M.DConvQN(in_channel,action_size, seed)


optimizer = optim.Adam             # optimizer class, handed to the agent together with param_opt
lr = 0.00005                       # learning rate 
param_opt = dict(lr=lr)            # the optimizer parameters
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


agent = AgentQ(qnetwork_local, qnetwork_target, optimizer, param_opt, 
               action_size, seed, device, "double", buffer_size, batch_size, a=1., b=1.)               # define the agent 


check="Navigation_Pixels_checkpoint"        # the checkpoint folder


### 3. Train the Agent



In [None]:
def trainer_Pixels(agent,path,n_frames,phi,n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning on states made of stacked frames.

    Reads the notebook-level globals `env`, `brain_name` and `state_size`.
    Every 100 episodes the local network weights are saved to
    `path/checkpoint_<mean score of the last 100 episodes>.pth`.
    
    Params
    ======
        agent: the agent to train
        path: checkpoint path (created if it does not exist)
        n_frames (int): number of frame per state
        phi (callable): function transform the frame
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Returns
    =======
        scores (list): the score of every episode, in order
    """
    # torch.save does not create missing folders: make sure the checkpoint folder
    # exists before spending time on training (an empty path means the current folder)
    if path:
        os.makedirs(path, exist_ok=True)

    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    
    
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]                                  # reset the environment
        # start from n_frames-1 blank frames so that the very first state is already a full stack
        Xs = deque([phi(np.zeros(state_size)),]*(n_frames-1),maxlen=n_frames)              # the last n_frames frames
        x = phi(env_info.visual_observations[0])                                           # get the current frame
        Xs.append(x)                                                                       # save the new frame
        state=np.hstack(Xs)                                                                # stack the frames along axis 1
        score = 0
        for t in range(max_t):
            action= agent.act(state, eps)
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_x = phi(env_info.visual_observations[0])  # get the next frame
            Xs.append(next_x)                              # save the new frame (the oldest one is dropped)
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            next_state=np.hstack(Xs)                       # make state
            agent.step(state, action, reward, next_state, done)
            state=next_state
            
            score += reward
            if done:
                break 
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        mean=np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean))
            # the mean score is part of the file name so the best checkpoint can be found later
            save_path=os.path.join(path,'checkpoint_'+str(mean)+'.pth')
            torch.save(agent.qnetwork_local.state_dict(),save_path)
        
    return scores



In [None]:
# train the agent; checkpoints are written to the `check` folder
scores = trainer_Pixels(agent, path=check, n_frames=n_frames, phi=phi)

In [None]:
# plot the score obtained in each episode
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

### 4. Watch the Smart Agent!

In the next code cell, we will load the trained weights and evaluate the model.

In [None]:
# load the weights from file
## get all checkpoints  
checkpoints=[f for f in os.listdir(check) if f.startswith('checkpoint_') and f.endswith('.pth') ]
if not checkpoints:
    raise FileNotFoundError("no checkpoint_*.pth file found in "+repr(check))

## find the index of maximum score (the mean score is encoded in the file name,
## between the 'checkpoint_' prefix and the '.pth' suffix)
index=np.argmax([float(f[11:-4]) for f in checkpoints])

## take the optimal checkpoint
name=checkpoints[index]

print(name)

path=os.path.join(check,name)

# map_location lets the weights load on whichever device the notebook is configured for
agent.qnetwork_local.load_state_dict(torch.load(path, map_location=device))

for i in range(10):
    env_info = env.reset(train_mode=False)[brain_name]      # reset the environment
    # start from n_frames-1 blank frames so that the first state is already a full stack
    Xs = deque([phi(np.zeros(state_size)),]*(n_frames-1),maxlen=n_frames)              # the last n_frames frames
    x = phi(env_info.visual_observations[0])                                           # get the current frame
    Xs.append(x)                                                                       # save the new frame
    state=np.hstack(Xs)                                                                # make state
    score=0                                                 # initialise score
    for j in range(1000):
        action = agent.act(state)
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_x = phi(env_info.visual_observations[0])  # get the next frame
        Xs.append(next_x)                              # save the new frame
        next_state=np.hstack(Xs)                       # make state
        state=next_state                               # act on the newest frames at the next step
        score += env_info.rewards[0]                   # update score
        done = env_info.local_done[0]                  # see if episode has finished
        if done:
            break 
    # report the score even when the episode did not finish within the step limit
    print("trail :",i+1,"score :",score)
            
env.close()