In [None]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2,
# which re-imports modified modules before each cell executes. This lets
# edits to the local `cube`, `dqn` and `solver` modules imported below take
# effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# NOTE(review): `%pylab inline` enables inline matplotlib output but also
# star-imports numpy and matplotlib.pyplot names into the notebook namespace.
# The explicit imports below run afterwards and therefore take precedence.
# Consider `%matplotlib inline` instead if no cell relies on the implicit
# names - TODO confirm against the rest of the notebook.
%pylab inline

import os
import pickle
import random
from itertools import count

import gym
import numpy as np
import pycuber
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
from gym import spaces
from gym.utils import seeding
from tensorboardX import SummaryWriter
from torch.distributions import Categorical

import cube
import dqn
import solver

In [None]:
def test_agent(env, policy_net, n_episodes, max_episodes_length):
    test_reward = []
    test_entropy = []
    n_success_episodes = 0
    
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset().view(1, -1)
        
        for t in range(max_episodes_length):
            output = policy_net(state).detach()
            action = output.max(1)[1]
            categorical = Categorical(logits=output)
            test_entropy.append(float(categorical.entropy()))

            state, reward, done, _ = env.step(action.item())
            state = state.view(1, -1)
            episode_reward += reward
            if done:
                n_success_episodes += 1
                break

        test_reward.append(episode_reward)
    
    return (
        float(n_success_episodes) / n_episodes,
        np.mean(test_reward),
        np.mean(test_entropy))

In [None]:
def optimize_model():
    """Perform one DQN gradient step on a batch sampled from replay memory.

    Operates on the notebook-level globals ``memory``, ``policy_net``,
    ``target_net``, ``optimizer``, ``writer``, ``steps_done``, ``BATCH_SIZE``
    and ``GAMMA``. Returns immediately, without training, while the replay
    memory holds fewer than ``BATCH_SIZE`` transitions.

    The regression target for each sampled transition is
    ``reward + GAMMA * max_a target_net(next_state)[a]``, with the second
    term taken as zero for terminal transitions (stored with
    ``next_state is None``). The loss is the mean squared error between that
    target and ``policy_net``'s value for the action actually taken.
    After the optimizer step, a weight-magnitude summary is logged to
    ``writer`` under ``network/norm_weights``.
    """
    if len(memory) < BATCH_SIZE:
        return

    transitions = memory.sample(BATCH_SIZE)
    # Transpose a sequence of Transition tuples into one Transition whose
    # fields are tuples over the batch.
    batch = dqn.Transition(*zip(*transitions))

    # Batch positions of the non-terminal transitions. Integer indices are
    # used instead of a uint8 mask because indexing with uint8 tensors is
    # deprecated in PyTorch.
    non_final_indices = torch.tensor(
        [i for i, s in enumerate(batch.next_state) if s is not None],
        dtype=torch.long)

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a): the model computes Q(s_t) for every action, then gather
    # selects the column of the action that was taken.
    state_action_values = policy_net(state_batch).gather(
        1, action_batch.view(-1, 1))

    # V(s_{t+1}) for all next states; terminal states keep the value 0.
    next_state_values = torch.zeros(BATCH_SIZE)
    # torch.cat raises on an empty list, so skip the target network entirely
    # when every sampled transition is terminal.
    if non_final_indices.numel() > 0:
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        # The target is treated as a constant: no gradient flows through
        # target_net.
        with torch.no_grad():
            next_state_values[non_final_indices] = target_net(
                non_final_next_states).max(1)[0]

    # Expected Q values (one-step bootstrapped targets).
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.mse_loss(
        state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1]; unlike a manual loop over
    # param.grad this also tolerates parameters that received no gradient.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

    # Mean, over all state_dict tensors, of each tensor's mean squared entry.
    # .cpu() keeps the logging working if the network lives on another
    # device.
    weights_norm = np.mean([
        np.mean(w.detach().cpu().numpy() ** 2)
        for w in policy_net.state_dict().values()])
    writer.add_scalar('network/norm_weights', weights_norm, steps_done)

In [None]:
def train(learn_env, test_env, num_episodes):
    """Collect experience and train the DQN for ``num_episodes`` episodes.

    Each outer iteration plays one episode in ``learn_env`` using actions
    from ``policy_net.sample_action``, pushes every transition into the
    global replay ``memory`` (terminal transitions are stored with
    ``next_state=None``), and then calls ``optimize_model()`` once.

    The global counter ``steps_done`` is incremented once per episode and
    drives the periodic work:

    * every ``TARGET_UPDATE`` counts, ``target_net`` is synced to
      ``policy_net``;
    * every ``TEST_PERIOD`` counts (including count 0), the policy is
      evaluated on ``test_env`` and the metrics are printed and logged to
      ``writer``;
    * every ``SAVE_PERIOD`` counts (excluding count 0), the policy weights
      are pickled under ``weights/``.

    Args:
        learn_env: Environment used to gather training transitions.
        test_env: Environment passed to ``test_agent`` for evaluation.
        num_episodes: Number of training episodes to run.

    Side effects:
        Mutates the globals ``steps_done``, ``episode_durations`` (one entry
        per episode: the number of steps played), ``memory``, ``policy_net``
        and ``target_net``; reads ``max_train_episode_length``.
    """
    global steps_done

    for _ in range(num_episodes):
        state = learn_env.reset().view(1, -1)

        for t in count():
            action = policy_net.sample_action(state)
            next_state, reward, done, _info = learn_env.step(action.item())
            next_state = next_state.view(1, -1)
            reward = torch.FloatTensor([reward])

            # A terminal transition is marked by a missing next state.
            if done:
                next_state = None

            memory.push(state, action, next_state, reward)
            state = next_state

            # Stop on termination or once the per-episode step cap is hit.
            if done or t == max_train_episode_length - 1:
                episode_durations.append(t + 1)
                break

        optimize_model()
        if steps_done % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if steps_done % TEST_PERIOD == 0:
            success_rate, avg_reward, avg_entropy = test_agent(
                test_env, policy_net, n_episodes=100,
                max_episodes_length=100)
            print(steps_done, success_rate, avg_reward)

            writer.add_scalar('cube/success_rate', success_rate, steps_done)
            writer.add_scalar('cube/avg_reward', avg_reward, steps_done)
            writer.add_scalar('policy/avg_entropy', avg_entropy, steps_done)

        if steps_done and steps_done % SAVE_PERIOD == 0:
            checkpoint_path = (
                'weights/dqn_steps_10_mean_100_gamma_0.9_batch_512_episodes_{}.pkl'
                .format(steps_done))
            # Create the target directory up front so that a long run does
            # not die at its first checkpoint, and close the file handle
            # deterministically so the checkpoint is fully flushed.
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            with open(checkpoint_path, 'wb') as checkpoint_file:
                pickle.dump(policy_net.state_dict(), checkpoint_file)

        steps_done += 1

In [None]:
# --- Optimisation hyper-parameters -----------------------------------------
# Number of transitions sampled from replay memory per optimisation step.
BATCH_SIZE = 512
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.9

# --- Schedules, measured in `steps_done` counts -----------------------------
# (`train` increments `steps_done` once per training episode.)
# Period for copying the policy network weights into the target network.
TARGET_UPDATE = 500
# Period for evaluating the policy on the test environment.
TEST_PERIOD = 1000
# Period for pickling the policy network weights.
SAVE_PERIOD = 50000

# --- Constants not referenced by the cells shown in this notebook ----------
EPS = 1e-100
TAU = 1.0

In [None]:
# Two identically configured networks, built in order: the policy network
# that is trained, then the target network used for bootstrapped targets.
policy_net, target_net = (
    dqn.DQN(n_space=cube.N_SPACE, n_action=cube.N_ACTION)
    for _ in range(2))

# The target network starts as an exact copy of the policy network and is
# kept in evaluation mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only the policy network's parameters are optimised.
optimizer = optim.Adam(policy_net.parameters())
# Replay memory constructed with an argument of one million.
memory = dqn.ReplayMemory(10 ** 6)
# Counter incremented by train(); used as the global step for logging.
steps_done = 0

# TensorBoard event writer for this run.
writer = SummaryWriter(log_dir='logs/dqn_steps_10_scale_100_batch_512')

In [None]:
# Globals consumed by train(): the per-episode step counts it records and
# the cap on the number of steps in a single training episode.
episode_durations = []
max_train_episode_length = 50

# Separate environments for gathering experience and for evaluation.
# NOTE(review): `steps` is presumably the scramble depth and `reward_scale`
# a reward multiplier - confirm against cube.CubeEnv.
learn_env = cube.CubeEnv(steps=10, reward_scale=100)
test_env = cube.CubeEnv(steps=5)

train(learn_env, test_env, num_episodes=100000)

In [None]:
# Persist the final policy weights. The directory is created if missing and
# the file is opened in a `with` block so the handle is closed (and the data
# flushed) deterministically instead of being left to garbage collection.
os.makedirs('weights', exist_ok=True)
with open(
        'weights/dqn_steps_10_mean_100_gamma_0.9_batch_512_episodes_100000.pkl',
        'wb') as weights_file:
    pickle.dump(policy_net.state_dict(), weights_file)

In [None]:
# Evaluate the trained policy over 1000 episodes on an environment built
# with steps=5 and report the fraction of episodes that terminated.
test_env = cube.CubeEnv(steps=5)

success_rate, avg_reward, avg_entropy = test_agent(
    env=test_env,
    policy_net=policy_net,
    n_episodes=1000,
    max_episodes_length=100)

print(success_rate)

In [None]:
# Evaluate the trained policy over 1000 episodes on an environment built
# with steps=10 and report the fraction of episodes that terminated.
test_env = cube.CubeEnv(steps=10)

success_rate, avg_reward, avg_entropy = test_agent(
    env=test_env,
    policy_net=policy_net,
    n_episodes=1000,
    max_episodes_length=100)

print(success_rate)