# Install Dependencies

In [0]:
!pip install gym
!pip install box2d_py

Collecting box2d_py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 2.8MB/s 
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8


# Check if we are allocated a GPU



# Connect to Google Drive

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The call is interactive: it prints an authorization URL and waits for the
# code to be pasted (see the recorded output below).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Work from the project folder on Drive: the training cell imports the local
# `ddpg_agent` module and reads/writes relative paths (./actor, ./critic,
# record.dat, record_mean.dat), so it must run from this directory.
%cd /content/drive/My Drive/BipedalWalker-v3/DDPG

/content/drive/.shortcut-targets-by-id/1TyolH62paiFvrPtkZ3ZJunv4rqrxh7Nz/progettoDataDriven/gym_BipedalWalker-v3/DDPG/1_DDPG_Pytorch


In [4]:
import gym
import torch
import numpy as np
from ddpg_agent import Agent
import matplotlib.pyplot as plt
import pickle
from collections import deque

# ---------------------------------------------------------------------------
# DDPG training script for BipedalWalker-v3.
#
# Builds the environment and the agent, optionally resumes from a saved
# checkpoint, then runs episodes until either MAX_EPISODES is reached or the
# mean score over the last MEAN_EVERY episodes reaches MAX_REWARD.
# Per-episode and per-window statistics are appended to pickle record files,
# and the four networks are checkpointed every MEAN_EVERY episodes.
# ---------------------------------------------------------------------------

gym.logger.set_level(40)  # 40 is gym's ERROR level: hide warnings/info messages
env = gym.make('BipedalWalker-v3')

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# NOTE(review): BATCH_SIZE, BUFFER_SIZE, GAMMA, TAU, LR and UPDATE_EVERY are not
# passed to Agent in this cell; presumably ddpg_agent keeps its own values —
# confirm before relying on the numbers below.
BATCH_SIZE = 64

MAX_EPISODES = 20000
MAX_REWARD = 300  # mean score over MEAN_EVERY episodes at which training stops
MAX_STEPS = 2000  # env._max_episode_steps
BUFFER_SIZE = int(1e5)  # replay buffer size
GAMMA = 0.99  # discount factor
TAU = 1e-3  # for soft update of target parameters
LR = 5e-4  # learning rate
UPDATE_EVERY = 4  # how often to update the network
MEAN_EVERY = 100  # running-mean window and checkpoint period, in episodes

start_episode = 0

agent = Agent(state_size=state_dim, action_size=action_dim, random_seed=0)

LOAD = True  # resume from the checkpoint written at episode LOAD_EPISODE
LOAD_EPISODE = 18900
noise = 1  # forwarded unchanged to agent.act(); its meaning is defined by ddpg_agent

RECORD_FILE = 'record.dat'  # one pickled row per episode
RECORD_MEAN_FILE = 'record_mean.dat'  # one pickled row per MEAN_EVERY episodes

# (agent attribute, sub-directory / file stem, target-network tag)
_CHECKPOINT_NETWORKS = (
    ('actor_local', 'actor', ''),
    ('critic_local', 'critic', ''),
    ('actor_target', 'actor', 't_'),
    ('critic_target', 'critic', 't_'),
)


def _checkpoint_path(kind, target_tag, episode, best=False):
    """Return the checkpoint file path for one network.

    kind is 'actor' or 'critic', target_tag is '' for the local network and
    't_' for the target network, and best selects the 'best_' file name that
    is used when the task is solved.
    """
    best_tag = 'best_' if best else ''
    return './{0}/checkpoint_{0}_{1}{2}ep{3}.pth'.format(kind, best_tag, target_tag, episode)


def _save_checkpoint(agent, episode, best=False):
    """Save the state dicts of the four agent networks for the given episode."""
    for attr, kind, target_tag in _CHECKPOINT_NETWORKS:
        torch.save(getattr(agent, attr).state_dict(),
                   _checkpoint_path(kind, target_tag, episode, best))


def _load_checkpoint(agent, episode):
    """Load the state dicts of the four agent networks saved at the given episode."""
    for attr, kind, target_tag in _CHECKPOINT_NETWORKS:
        weights = torch.load(_checkpoint_path(kind, target_tag, episode), map_location="cpu")
        getattr(agent, attr).load_state_dict(weights)


def _append_record(path, row):
    """Append one pickled row to path.

    The file is a sequence of independent pickles, so a reader has to call
    pickle.load repeatedly until EOFError.
    """
    with open(path, "ab") as f:
        pickle.dump(row, f)


if LOAD:
    start_episode = LOAD_EPISODE
    _load_checkpoint(agent, LOAD_EPISODE)

scores = []
mean_scores = []
last_scores = deque(maxlen=MEAN_EVERY)
distances = []
mean_distances = []
last_distance = deque(maxlen=MEAN_EVERY)
losses_mean_episode = []

try:
    for ep in range(start_episode + 1, MAX_EPISODES + 1):
        state = env.reset()
        total_reward = 0
        total_distance = 0
        actor_losses = []
        critic_losses = []
        for _ in range(MAX_STEPS):

            # env.render()

            action = agent.act(state, noise)
            # The environment receives action[0] while the agent stores the full
            # `action` — presumably agent.act returns a batch of one; confirm.
            next_state, reward, done, info = env.step(action[0])
            actor_loss, critic_loss = agent.step(state, action, reward, next_state, done)
            if actor_loss is not None:
                actor_losses.append(actor_loss)
            if critic_loss is not None:
                critic_losses.append(critic_loss)
            state = next_state
            total_reward += reward
            # A reward of exactly -100 is excluded, so total_distance is the
            # score without that penalty.
            if reward != -100:
                total_distance += reward
            if done:
                break

        # Losses are only available for steps on which agent.step returned them.
        if len(actor_losses) >= 1 and len(critic_losses) >= 1:
            mean_loss_actor = np.mean(actor_losses)
            mean_loss_critic = np.mean(critic_losses)
            losses_mean_episode.append((ep, mean_loss_actor, mean_loss_critic))
        else:
            mean_loss_actor = None
            mean_loss_critic = None

        print(
            '\rEpisode: {}/{},\tScore: {:.2f},\tDistance: {:.2f},\tactor_loss: {},\tcritic_loss:{}'.format(
                ep, MAX_EPISODES, total_reward, total_distance, mean_loss_actor, mean_loss_critic),
            end="")

        scores.append(total_reward)
        distances.append(total_distance)
        last_scores.append(total_reward)
        last_distance.append(total_distance)
        mean_score = np.mean(last_scores)
        mean_distance = np.mean(last_distance)
        _append_record(RECORD_FILE,
                       [ep, total_reward, total_distance, mean_loss_actor, mean_loss_critic])

        # Only declare the task solved once the window holds MEAN_EVERY episodes;
        # otherwise a single lucky episode right after (re)starting would stop
        # training.
        if len(last_scores) == MEAN_EVERY and mean_score >= MAX_REWARD:
            print('Task Solved')
            _save_checkpoint(agent, ep, best=True)
            break

        if ep % MEAN_EVERY == 0:
            _save_checkpoint(agent, ep)
            mean_scores.append(mean_score)
            mean_distances.append(mean_distance)
            print('\rEpisode: {}/{},\tMean Score: {:.2f},\tMean Distance: {:.2f},\tactor_loss: {},\tcritic_loss:{}'.format(
                ep, MAX_EPISODES,
                mean_score,
                mean_distance, mean_loss_actor,
                mean_loss_critic))
            _append_record(RECORD_MEAN_FILE,
                           [ep, mean_score, mean_distance, mean_loss_actor, mean_loss_critic])
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()


Episode: 19000/20000,	Mean Score: -109.16,	Mean Distance: -9.16,	actor_loss: 1.3522928953170776,	critic_loss:3.7594549655914307
Episode: 19100/20000,	Mean Score: -109.09,	Mean Distance: -9.09,	actor_loss: 3.221298933029175,	critic_loss:3.918891668319702
Episode: 19200/20000,	Mean Score: -105.06,	Mean Distance: -5.06,	actor_loss: 3.8261525630950928,	critic_loss:3.7365567684173584
Episode: 19300/20000,	Mean Score: -104.06,	Mean Distance: -4.06,	actor_loss: 2.9740633964538574,	critic_loss:3.3186819553375244
Episode: 19400/20000,	Mean Score: -100.50,	Mean Distance: -0.50,	actor_loss: 2.3565618991851807,	critic_loss:3.8741800785064697
Episode: 19500/20000,	Mean Score: -101.83,	Mean Distance: -1.83,	actor_loss: 1.7265989780426025,	critic_loss:4.448746681213379
Episode: 19600/20000,	Mean Score: -106.94,	Mean Distance: -6.94,	actor_loss: 0.46157369017601013,	critic_loss:4.289445400238037
Episode: 19700/20000,	Mean Score: -107.16,	Mean Distance: -8.16,	actor_loss: 0.7497286796569824,	critic_los