In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
model_dir = '/content/gdrive/My Drive/Winter 2020/DL/Project 2/models/'
!cp /content/gdrive/My\ Drive/Winter\ 2020/DL/Project\ 2/data/*.zip .
!unzip /content/sudoku.zip
!mkdir /content/test
!unzip /content/sudoku_test.zip -d /content/test
!mv /content/test/sudoku.csv /content/sudoku_test.csv

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
Archive:  /content/sudoku.zip
  inflating: sudoku.csv              
Archive:  /content/sudoku_test.zip
  inflating: /content/test/sudoku.csv  


In [2]:
# Fetch the project source from GitHub.
!git clone https://github.com/cloughurd/drl-sudoku.git
# Move everything under the repo's rl/ directory into the working directory.
# Presumably this is what makes `env`, `helpers` and `qnetwork` importable as
# top-level modules in the import cell below -- TODO confirm.
!mv drl-sudoku/rl/* .

Cloning into 'drl-sudoku'...
remote: Enumerating objects: 250, done.[K
remote: Counting objects: 100% (250/250), done.[K
remote: Compressing objects: 100% (196/196), done.[K
remote: Total 250 (delta 150), reused 106 (delta 46), pack-reused 0[K
Receiving objects: 100% (250/250), 380.17 KiB | 4.18 MiB/s, done.
Resolving deltas: 100% (150/150), done.


In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import json

# Fail fast when no CUDA device is available: dqn_main moves both networks to the GPU with .cuda().
assert torch.cuda.is_available()
from IPython.core.ultratb import AutoFormattedTB
# Verbose IPython traceback formatter; not referenced by any other cell visible here.
__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)

from env.fullgrid import GridEnv
from helpers import prepare_batch, learn_dqn, get_action_dqn
from qnetwork import QNetwork

In [4]:
def dqn_main(num_epochs=50000):
    """Train a DQN agent on sudoku puzzles and return the per-episode rewards.

    Each epoch is one episode: a puzzle is drawn with ``env.reset()`` and the
    agent acts until the environment reports ``done`` or the absolute
    cumulative reward reaches 16. Transitions are appended to a replay list
    and, once more than ``start_training`` steps have been taken, a batch is
    sampled and ``learn_dqn`` is called every ``learn_frequency`` steps.

    Side effects:
        Every ``save_freq`` episodes the Q-network weights and a JSON file
        with the reward curves and losses are written under the module-level
        ``model_dir``. Prints the number of learning updates when finished.

    Args:
        num_epochs: Number of episodes to run.

    Returns:
        list: Cumulative reward of every episode, in order.
    """
    # Hyper parameters
    lr = 1e-3
    start_training = 1000  # steps taken before the first learning update
    gamma = 0.99           # forwarded to learn_dqn
    batch_size = 32
    epsilon = 1            # updated by get_action_dqn on every step
    epsilon_decay = .9999
    target_update = 1000   # forwarded to learn_dqn together with global_step
    learn_frequency = 4    # learn on every 4th step

    # Init environment
    action_size = 9*81
    env = GridEnv('/content/sudoku.csv', max_len=5000000)

    # Init networks; the target network starts as an exact copy of the Q-network
    q_network = QNetwork(18, action_size).cuda()
    target_network = QNetwork(18, action_size).cuda()
    target_network.load_state_dict(q_network.state_dict())

    # Init optimizer (named `optimizer` so the imported `optim` module is not shadowed)
    optimizer = torch.optim.Adam(q_network.parameters(), lr=lr)

    # Init replay buffer. NOTE: a plain list that is only ever appended to,
    # so it grows by one transition per environment step for the whole run.
    memory = []

    total_learnings = 0

    # Begin main loop
    save_freq = 5000
    results_dqn = []
    losses = []
    reward_curves = {}
    global_step = 0
    loop = tqdm(total=num_epochs, position=0, leave=False)
    for epoch in range(num_epochs):
        # New puzzle
        state, goal = env.reset()
        done = False
        cum_reward = 0  # Track cumulative reward per episode
        rewards = []
        pos_count = 0

        # Begin episode; also stop once |cumulative reward| reaches 16
        while not done and abs(cum_reward) < 16:
            # Select e-greedy action (the helper also returns the updated epsilon)
            action, epsilon = get_action_dqn(q_network, state, epsilon, epsilon_decay)

            # Take step
            next_state, reward, done = env.act(state, action, goal)
            # env.render()

            # Store step in replay buffer
            memory.append((state, action, next_state, reward, done))

            # Non-negative rewards are what the progress bar reports as "positive"
            if reward >= 0:
                pos_count += 1
            cum_reward += reward
            rewards.append(reward)
            global_step += 1  # Increment total steps
            state = next_state  # Set current state

            # If time to train
            if global_step > start_training and global_step % learn_frequency == 0:
                total_learnings += 1

                # Sample batch
                batch = prepare_batch(memory, batch_size)

                # Train
                loss = learn_dqn(batch, optimizer, q_network, target_network, gamma, global_step, target_update)
                losses.append((global_step, loss))

        # Print results at end of episode
        results_dqn.append(cum_reward)
        reward_curves[epoch] = rewards
        loop.update(1)
        loop.set_description('Episodes: {} Reward: {} Epsilon: {:.4f} Positive Reward Count: {}'.format(epoch, cum_reward, epsilon, pos_count))

        # The parentheses matter: `%` binds tighter than `+`, so the unparenthesised
        # `epoch+1 % save_freq` is `epoch + 1`, never 0, and nothing would be saved.
        if (epoch + 1) % save_freq == 0:
            torch.save(q_network.state_dict(), model_dir + f'rl-{epoch}.mod')
            # `with` closes the file even if serialisation fails. `default=float`
            # is a fallback for values json cannot encode natively (e.g. numpy or
            # torch scalars, in case rewards/losses are not plain Python numbers).
            with open(model_dir + f'rl-results-{epoch}.json', 'w') as results_file:
                json.dump({'rewards': reward_curves, 'loss': losses},
                          results_file, default=float)

    # Release the manually driven progress bar before printing the summary
    loop.close()
    print(total_learnings)
    return results_dqn

# Run training with the default 50000 episodes; keeps the per-episode cumulative rewards for the plot cell.
results_dqn = dqn_main()

Episodes: 3999 Reward: -16 Epsilon: 0.0785 Positive Reward Count: 0:   8%|▊         | 4000/50000 [44:25<8:52:03,  1.44it/s]

KeyboardInterrupt: ignored

In [0]:
# Learning curve: one point per episode, using the cumulative rewards returned by dqn_main.
fig, ax = plt.subplots()
ax.plot(results_dqn)
ax.set(xlabel='Episode', ylabel='Cumulative reward',
       title='DQN training: cumulative reward per episode')
plt.show()