In [1]:
import sys
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import copy
from common_rl_lib import *
sys.path.append('p3')
import gridworld as gw


In [2]:
"""Prediction Algorithms - using dynamics model
LA and DP algorithms for solving small MDPs directly or iteratively
from generated MDPs - to get the true value function and compare it to the sampling methods below
"""
class LASolver: # solve using linear algebra v = (1 - discount*dynamics)^-1*rewards
    def __init__(self, mdp, policy=None):
        self.mdp = mdp
        self.policy = policy # if None, then this is find the optimal policy
        if self.mdp.deterministic:
            self.det_solve()
        else:
            self.sto_solve()

class ValueIterationSolver:
    def __init__(self, mdp, policy=None, discount=0.9, threshold=1e-3):
        self.mdp = mdp
        self.policy = policy
        self.givenPolicy = policy is not None
        self.discount = discount
        self.values = SafeDict()
        self.threshold = threshold # threshold for convergence
        self.solved = False
        self.iterations = 0 # iterations completed
    
    def _improveValues(self): # internal function
        self.iterations += 1
        newValues = SafeDict()
        max_delta = 0
        for s in self.mdp.state_space():
            if self.givenPolicy:
                newValues[s] = self.computeQValueFromValues(s, self.policy.decide(s, self.mdp.action_space(s)))
            else:
                options = [ self.computeQValueFromValues(s, a) for a in self.mdp.action_space(s)]
                if len(options): # not terminal state
                    newValues[s] = max(options)
            error = np.abs(self.values[s] - newValues[s])
            if error > max_delta:
                max_delta = error
        self.values = newValues
        return max_delta
    
    def computeQValueFromValues(self, state, action):
        return sum([ prob * (self.mdp.getReward(state=state, action=action, newstate=sp) + (self.discount * self.values[sp])) 
                for sp, prob in self.mdp.getTransitions(state=state, action=action)])
    
    def slowCompute(self, state, action):
        parts = [ prob * (self.mdp.getReward(state=state, action=action, newstate=sp) + (self.discount * self.values[sp])) 
                for sp, prob in self.mdp.getTransitions(action=action,state=state)]
        #print '--res s:', state, 'a:', action, 'calc:', parts
        return sum(parts)
    
    def iterate(self, iterations=10): # iterates regardless of whether mdp is already solved
        delta = 0
        for _ in range(iterations):
            delta = self._improveValues()
        if delta < self.threshold: self.solved = True
        return delta # from last iteration
    
    def solve(self, threshold=None): # only solves in passed in threshold is smaller than solved threshold
        if self.solved and threshold is not None and threshold >= self.threshold:
            return -1 # mdp is already solved
        if threshold is not None:
            self.threshold = threshold
            self.solved = False
        converged = False
        delta = 0
        while not converged:
            delta = self._improveValues()
            converged = delta < self.threshold
        self.solved = True
        return self.iterations # total number of iterations completed
            
    def getValue(self, state):
        return self.values[state]
    
    def getPolicy(self):
        if not self.givenPolicy:
            if not self.solved: self.solve()
            self.policy = Policy(decision=self.computeActionFromValues)
        return self.policy
    
    def computeActionFromValues(self, state):
        actions = self.mdp.getPossibleActions(state)
        if not len(actions):
            return None
        values = [self.computeQValueFromValues(state, a) for a in actions]
        return max(zip(actions,values),key=lambda x: x[1])[0]

In [3]:
"""Run Value Iteration on Gridworld MDPs"""
for name in gridWorlds.keys():
    print name, ValueIterationSolver(GridMDP(name)).solve() # prints number of iterations until convergence

bridge 5
discount 30
book 16
maze 23
cliff2 17
cliff 21


In [4]:
"""Model-Free Prediction Algorithms - for policy evaluation
Standard Monte Carlo - only adjusts value function at the end of the episode
TD(0) - bootstraps each step to incrementally improve value function
TD(lambda) - bootstraps with eligibility function, to change between MC and TD
"""
class MC:
    def __init__(self, policy=Policy(), alpha=0.1, discount=0.9):
        self.alpha = alpha
        self.discount = discount
        self.values = SafeDict()
        self.policy = policy
        self.episode = [] # keeps track of visited states in this episode
        self.gain = 0
        self.state = None
        
    def take_action(self, actions):
        return self.policy.decide(self.state, actions)
    
    def eval_action(self, newstate, reward, newactions):
        self.episode.append(newstate)
        self.state = newstate
        self.gain += reward
        return self.gain
    
    def reset(self): # for MC this is where value function is improved
        total_discount = 1
        while len(self.episode):
            s = self.episode.pop()
            self.values[s] += self.alpha * (total_discount * self.gain - self.values[s])
            total_discount *= self.discount
        self.episode = []
        self.gain = 0
        self.state = None

class TD:
    def __init__(self, policy=Policy(), alpha=0.1, discount=0.9):
        self.alpha = alpha
        self.discount = discount
        self.values = SafeDict() # value function
        self.state = None
        self.policy = policy
    
    def take_action(self, actions): # reward is from previous action
        return self.policy.decide(self.state, actions)
    
    def eval_action(self, newstate, reward, newactions):
        if self.state is not None:
            self.values[self.state] += self.alpha * (reward + self.discount * self.values[newstate] - self.values[self.state])
            exp_reward = self.values[self.state]
        self.state = newstate
        return self.values[self.state] # return metric (eg. expected reward, loss, etc.)
    
    def reset(self):
        self.state = None

class TD_lambda:
    def __init__(self, policy=Policy(), alpha=0.1, discount=0.9, lmbda=0.9, threshold=1e-6):
        self.alpha = alpha
        self.discount = discount
        self.lmbda = lmbda
        self.threshold = threshold # remove insignificantly small elig. states to improve performance
        self.values = SafeDict() # value function
        self.policy = policy
        self.eligibility = SafeDict()
        self.state = None
    
    def take_action(self, actions): # reward is from previous action
        return self.policy.decide(self.state, actions)
    
    def eval_action(self, newstate, reward, newactions):
        if self.state is not None:
            for s in self.eligibility.keys():
                self.eligibility[s] *= self.discount * self.lmbda
                if self.eligibility[s] < self.threshold: del self.eligibility[s] # remove insignificant states
            self.eligibility[self.state] += 1
            for s in self.eligibility.keys():
                self.values[s] += self.alpha * self.eligibility[s] * (reward + self.discount * self.values[newstate] - self.values[self.state])
        self.state = newstate
        return self.values[self.state]
    
    def reset(self): # reset at the end of an episode
        self.eligibility = SafeDict()
        self.state = None

In [5]:
"""Model-Free Prediction: Setup/Reset Run
    (Policy evaluation)"""

learner = TD_lambda() # random policy
watcher = MC()
watcher2 = TD()
test_mdp = GridMDP()
solver = ValueIterationSolver(test_mdp)
print test_mdp.gridName
completed_episodes = 0

solver.solve()

def calcMSE():
    solution = np.array([solver.values[s] for s in test_mdp.state_space()])
    #for s in test_mdp.state_space():
    #    print s, solver.values[s], [a.values[s] for a in algs]
    algs = [learner, watcher, watcher2]
    MSE = [np.mean((np.array([a.values[s] for s in test_mdp.state_space()]) - solution)**2) for a in algs]
    #print 'MSE', MSE

book


In [6]:
"""Model-Free Prediction: Runner"""

episodes = 100
printstep = episodes / 100 if episodes > 100 else 1
steps = 20
for i in range(episodes): # number of episodes
    episode_len = 0
    test_mdp.reset()
    learner.reset()
    watcher.reset()
    watcher2.reset()
    if i % printstep == 0: print '\nEpsiode', i, '\n   ',
    state = None
    reward = None
    done = len(test_mdp.action_space()) == 0
    for _ in range(steps): # steps per episode
        if done: break
        actions = test_mdp.action_space()
        state, reward, done = test_mdp.step(learner.take_action(actions))
        learner.eval_action(state, reward, actions)
        watcher.eval_action(state, reward, actions)
        watcher2.eval_action(state, reward, actions)
        #state = state_maker(observation) # only neede if ai gym env is used
        episode_len += 1
    if i % printstep == 0: print '\n--- len:', episode_len



Epsiode 0 
    
--- len: 20

Epsiode 1 
    
--- len: 19

Epsiode 2 
    
--- len: 12

Epsiode 3 
    
--- len: 13

Epsiode 4 
    
--- len: 20

Epsiode 5 
    
--- len: 20

Epsiode 6 
    
--- len: 20

Epsiode 7 
    
--- len: 13

Epsiode 8 
    
--- len: 20

Epsiode 9 
    
--- len: 15

Epsiode 10 
    
--- len: 20

Epsiode 11 
    
--- len: 19

Epsiode 12 
    
--- len: 20

Epsiode 13 
    
--- len: 7

Epsiode 14 
    
--- len: 20

Epsiode 15 
    
--- len: 20

Epsiode 16 
    
--- len: 20

Epsiode 17 
    
--- len: 20

Epsiode 18 
    
--- len: 20

Epsiode 19 
    
--- len: 13

Epsiode 20 
    
--- len: 20

Epsiode 21 
    
--- len: 20

Epsiode 22 
    
--- len: 20

Epsiode 23 
    
--- len: 5

Epsiode 24 
    
--- len: 13

Epsiode 25 
    
--- len: 20

Epsiode 26 
    
--- len: 20

Epsiode 27 
    
--- len: 20

Epsiode 28 
    
--- len: 18

Epsiode 29 
    
--- len: 20

Epsiode 30 
    
--- len: 20

Epsiode 31 
    
--- len: 9

Epsiode 32 
    
--- len: 9

Epsiode 33 
    
--- le

In [34]:
"""Model-Free Prediction: Analysis"""




TERMINAL_STATE 0.0 [0.0, -4.7340163, 0.0]
(0, 0) -100.0 [0.0, 0.0, 0.0]
(0, 1) 8.0 [3.7484720000000005, 3.373624800000001, 3.7484720000000005]
(0, 2) 6.92629886706 [0.0, 0.0, 0.0]
(1, 0) -100.0 [-10.0, -9.0, -10.0]
(1, 1) 5.58720954652 [1.4988356712000004, 1.6452720000000003, 0.20160000000000003]
(1, 2) 6.03268429233 [1.3665244199602646, 1.8575876520900005, 0.005248800000000001]
(2, 0) -100.0 [-10.0, -9.0, -10.0]
(2, 1) 5.81985049675 [-5.346873132125981, -6.817608, 0.0]
(2, 2) 6.52979280239 [2.021392223620745, 2.646546844456801, 0.0]
(3, 0) -100.0 [0.0, 0.0, 0.0]
(3, 1) 6.84213664784 [2.2284414294335777, 0.8100000000000002, 0.09000000000000001]
(3, 2) 7.52564386773 [2.082541470127543, 1.1875410000000002, 0.0]
(4, 0) -100.0 [0.0, 0.0, 0.0]
(4, 1) 10.0 [1.9, 0.9, 1.9]
(4, 2) 8.65637113418 [0.0, 0.0, 0.0]
MSE [1911.2230389949618, 2358.4452626924085, 1682.8905128940473]


In [4]:
"""Model-Free Control
On policy: SARSA - similar to TD, but now with epsilon greedy action choices\
           SARSA(lambda) - similar to TD(lambda)
Off policy: Q-learning
"""
class SARSA: # on policy - so policy is run while improving
    def __init__(self, policy=None, alpha=0.1, discount=0.9, epsilon=0.1):
        self.alpha = alpha
        self.discount = discount
        self.explore = lambda: epsilon > np.random.rand() # epsilon-greedy - constant epsilon
        self.Q = SafeDict() # Q function
        self.policy = policy
        if self.policy is None:
            self.policy = self.getPolicy() # epsilon greedy policy
        self.action = None
        self.state = None # current
    
    def take_action(self, actions):
        if self.action is None:
            self.action = self.policy.decide(self.state, actions)
        return self.action # already decided at last evaluation
    
    def eval_action(self, newstate, reward, newactions):
        if self.state is not None:
            newaction = self.policy.decide(newstate, newactions) if len(newactions) else None
            self.Q[self.state, self.action] += self.alpha * (reward + self.discount * self.Q[newstate, newaction] - self.Q[self.state, self.action])
            self.action = newaction
        self.state = newstate
        return self.Q[self.state, self.action]
    
    def __chooseAction(self, state): # epsilon-greedy policy
        if not self.explore(): # exploit
            options = [(a, self.Q[s,a]) for s,a in [key for key in self.Q.keys() if key[0]==state]]
            if len(options):
                return max(options, key=lambda x: x[1])[0] # argmax
        return None # policy will pick a random action
    
    def getPolicy(self, epsilon=None): # updates current policy if a prior policy was given
        if epsilon is not None:
            self.epsilon = epsilon
        self.policy = Policy(decision=self.__chooseAction)
        return self.policy
    
    def getValues(self):
        self.V = SafeDict()
        for state, _ in self.Q.keys():
            options = [self.Q[s,a] for s,a in [key for key in self.Q.keys() if key[0]==state]]
            if len(options):
                self.V[state] = max(options)
        return self.V
    
    def reset(self): # reset at the end of an episode
        self.state = None
        self.action = None

class SARSA_lambda: # on policy - so policy is run while improving
    def __init__(self, policy=None, alpha=0.1, discount=0.9, epsilon=0.1, lmbda=0.9, threshold=1e-6):
        self.alpha = alpha
        self.discount = discount
        self.lmbda = lmbda
        self.explore = lambda: epsilon > np.random.rand() # epsilon-greedy - constant epsilon
        self.threshold = threshold # remove insignificantly small elig. states to improve performance
        self.Q = SafeDict() # Q function
        self.E = SafeDict() # eligibility
        self.policy = policy
        if self.policy is None:
            self.policy = self.getPolicy() # epsilon greedy policy
        self.action = None
        self.state = None # current
    
    def take_action(self, actions):
        if self.action is None:
            self.action = self.policy.decide(self.state, actions)
        return self.action # already decided at last evaluation
    
    def eval_action(self, newstate, reward, newactions):
        if self.state is not None:
            newaction = self.policy.decide(newstate, newactions) if len(newactions) else None
            for sa in self.E.keys():
                self.E[sa] *= self.discount * self.lmbda
                if self.E[sa] < self.threshold: del self.E[sa] # remove insignificant states
            self.E[self.state, self.action] += 1
            for sa in self.E.keys():
                self.Q[sa] += self.alpha * self.E[sa] * (reward + self.discount * self.Q[newstate, newaction] - self.Q[self.state, self.action])
            self.action = newaction
        self.state = newstate
        return self.Q[self.state, self.action]
    
    def reset(self): # reset at the end of an episode
        self.E = SafeDict()
        self.state = None
        self.prevstate = None
        self.prevaction = None
    
    def __chooseAction(self, state): # epsilon-greedy policy
        if not self.explore(): # exploit
            options = [(a, self.Q[s,a]) for s,a in [key for key in self.Q.keys() if key[0]==state]]
            if len(options):
                return max(options, key=lambda x: x[1])[0] # argmax
        return None # policy will pick a random action
    
    def getPolicy(self, epsilon=None): # updates current policy if a prior policy was given
        if epsilon is not None:
            self.epsilon = epsilon
        self.policy = Policy(decision=self.__chooseAction)
        return self.policy
    
    def getValues(self):
        self.V = SafeDict()
        for state, _ in self.Q.keys():
            options = [self.Q[s,a] for s,a in [key for key in self.Q.keys() if key[0]==state]]
            if len(options):
                self.V[state] = max(options)
        return self.V
    
class Q_learning: # off policy - does not choose action, merely observes
    def __init__(self, policy=Policy(), alpha=0.1, discount=0.9):
        self.alpha = alpha
        self.policy = policy
        self.discount = discount
        self.Q = SafeDict() # Q function
        self.prevstate = None
        self.prevaction = None
    
    def step(self, state, action, reward): # reward is from previous action into state
        if self.prevstate is not None and self.prevaction is not None: # initial move
            bestAction = self.__chooseAction(state)
            if bestAction is None:
                bestAction = action # if you don't know any better trust the action of the supervisor
            self.Q[self.prevstate, self.prevaction] += self.alpha * (reward 
                                                                     + self.discount * self.Q[state, bestAction]
                                                                     - self.Q[self.prevstate, self.prevaction])
        self.prevstate = state
        self.prevaction = action
        return self.Q[state, action]
    
    def reset(self):
        self.prevstate = None
        self.prevaction = None
    
    def __chooseAction(self, state): # epsilon-greedy policy
        options = [(a, self.Q[s,a]) for s,a in [key for key in self.Q.keys() if key[0]==state]]
        if len(options):
            return max(options, key=lambda x: x[1])[0] # argmax
        return None # policy will pick a random action
    
    def getValues(self):
        self.V = SafeDict()
        for state, _ in self.Q.keys():
            options = [self.Q[s,a] for s,a in [key for key in self.Q.keys() if key[0]==state]]
            if len(options):
                self.V[state] = max(options)
        return self.V
    
    def getPolicy(self): # build exploitative policy from Q function
        self.policy = Policy(decision=self.__chooseAction)
        return self.policy
        

In [5]:
"""Model-free Control: Setup/Reset Run"""
learner = SARSA() # on-policy
watcher = Q_learning() # off-policy
test_mdp = GridMDP(gridName=None) # randomly chosen grid
print test_mdp.gridName
gains = []
completed_episodes = 0

book


In [6]:
"""Model-Free Control: Runner"""
episodes = 100
printstep = episodes / 100 if episodes > 100 else 1
steps = 50
for i in range(episodes): # number of episodes
    test_mdp.reset()
    learner.reset()
    watcher.reset()
    
    if i % printstep == 0: print 'Epsiode', i+completed_episodes,
    gain = 0
    episode_len = 0
    state = test_mdp.state
    learner.state = state # initial state
    watcher.state = state # initial state
    reward = None
    while True:
    #for _ in range(steps): # steps per episode
        action = learner.take_action(test_mdp.action_space())
        state, reward, done = test_mdp.step(action)
        learner.eval_action(state, reward, test_mdp.action_space())
        watcher.step(state, action, reward)
        #observation, reward, done, info = env.step(np.array([learner.take_action(state, reward, actions)]))
        #state = state_maker(observation)
        gain += reward
        episode_len += 1
        if done: break
    if i % printstep == 0: print 'len:', episode_len, '- gain:', gain
    gains.append(gain)
completed_episodes += episodes

Epsiode 0

TypeError: __chooseAction() takes exactly 2 arguments (3 given)

In [7]:
"""Model-Free Control: Analysis"""
plt.plot(gains) # shouldn't really decrease, since policy doesn't change
#plt.set_title('Gain')
plt.show()

if 0:
    #print 'Learner (SARSA)'
    for s in test_mdp.state_space():
        for a in test_mdp.action_space(s):
            print 'Q(' + str(s) + ',' + str(a) + ') = l', learner.Q[s,a], 'w', watcher.Q[s,a]
    #print '\nWatcher (Q-learner)'
    #for s,a in watcher.Q.keys():
    #    print 'Q(' + str(s) + ',' + str(a) + ') = ' + str(watcher.Q[s,a])
else:
    l_policy = learner.getPolicy(epsilon=0)
    w_policy = watcher.getPolicy()
    for s in test_mdp.state_space():
        actions = test_mdp.action_space(s)
        if len(actions):
            l_a = l_policy.decide(s, actions)
            w_a = w_policy.decide(s, actions)
            print 'state',s,'options', actions, 'l:', l_a, 'w:', w_a
            if l_a != w_a:
                print '\tlearner:', [learner.Q[s,a] for a in actions]
                print '\twatcher', [watcher.Q[s,a] for a in actions]

<IPython.core.display.Javascript object>

TypeError: __chooseAction() takes exactly 2 arguments (3 given)

In [23]:
"""Algorithm Comparison - MSE"""
test_mdp = GridMDP(gridName=None)
solver = ValueIterationSolver(test_mdp)
predictors = [MC(), TD(), TD_lambda()]
controllers = [SARSA(), SARSA_lambda()]
watchers = [Q_learning()]

In [2]:
"""Function Approximation - based on TD but now in continuous state space using a NN to approximate Q function with discrete action space,
Action out framework - so output is expected value of each possible action"""
def net_maker(actions, activator=None):
    if activator:
        return nn.Sequential(
            nn.Linear(2, 20),
            activator(),
            nn.Linear(20, 50),
            activator(),
            nn.Linear(50, 20),
            activator(),
            nn.Linear(20, len(actions)),
            )
    return nn.Sequential(
            nn.Linear(2, 20),
            nn.Linear(20, 50),
            nn.Linear(50, 20),
            nn.Linear(20, len(actions)), # action out
            )

class TD_net:
    def __init__(self, actions, policy=None, activator=None, alpha=0.001, discount=0.9, epsilon=0.1):
        self.discount = discount
        self.epsilon = epsilon
        self.explore = lambda e: e > np.random.rand() # epsilon-greedy - constant epsilon
        self.actions = actions # total action space
        self.model = net_maker(actions, activator)
        self.optimizer = optim.SGD(self.model.parameters(), lr=alpha, momentum=0.9)
        self.criterion = nn.MSELoss()
        self.state = None
        self.iter_counter = 0
        self.policy = policy
        if self.policy is None:
            self.policy = self.getPolicy()
    
    def getPolicy(self, epsilon=None):
        if epsilon is not None:
            self.epsilon = epsilon
        self.policy = Policy(decision=self._chooseAction)
        return self.policy
    
    def sample_actions(self, expectations, actions):
        expectations -= expectations.min() # guarantee expectations are positive, also makes it impossible to pick worst action
        expectations /= np.sum(expectations) # l1 normalization
        options = zip(actions, np.cumsum(expectations))
        sample = np.random.sample()
        try:
            action = [o[0] for o in options if sample < o[1]]
            if len(action):
                action = action[-1]
            else:
                action = options[0][0]
        except:
            print 'expectations', np.sum(expectations), expectations, options, sample
            raise Exception('Sampling failed')
        return action
    
    def _chooseAction(self, state):
        try:
            expectations = self.copy_model(state).data.numpy()
            if self.explore(self.epsilon):
                return self.sample_actions(expectations, actions)
            # exploit
            return actions[expectations.argmax()] # greedy
        except:
            return None
    
    def take_action(self, actions):
        if self.state is None: # initial move
            action = np.random.choice(actions)
            return action
        if len(actions) == 1:
            return actions[0]
        expectations = self.copy_model(self.state).data.numpy()
        if self.explore(self.epsilon):
            return self.sample_actions(expectations, actions)
        # exploit
        return actions[expectations.argmax()] # greedy
        #return self.policy.decide(self.state, actions)
    
    def eval_action(self, newstate, reward):
        if self.iter_counter % 200 == 0:
            self.copy_model = copy.deepcopy(self.model) # update frozen net
            #print 'cloned'
        self.iter_counter += 1
        newstate = Variable(torch.Tensor(newstate))
        if self.state is None: # initial move
            self.state = newstate
            return 0
        target = Variable(reward + self.discount*self.copy_model(newstate).data) # get target from frozen net
        self.optimizer.zero_grad()
        loss = self.criterion(self.model(self.state), target)
        loss.backward()
        self.optimizer.step()
        self.state = newstate
        return loss.data[0]
    
    def reset_state(self): # reset at the end of an episode
        #self.E = SafeDict()
        self.state = None

In [3]:
gridWorlds.keys()

['bridge', 'discount', 'book', 'maze', 'cliff2', 'cliff']

In [9]:
"""Run Function Approximation Control
Loss seems to be blowing up - not sure why, but I'm probably using pytorch incorrectly.
"""
test_mdp = GridMDP(gridName='bridge')
print test_mdp.gridName
learner = TD_net(np.array(test_mdp.action_space()), alpha=0.01,epsilon=0.2)# nn.PReLU)
gains = []
losses = []

bridge


In [10]:
episodes = 1000
printstep = episodes / 30
steps = 100
for i in range(episodes): # number of episodes
    test_mdp.reset()
    learner.reset_state()
    if i and i % printstep == 0: print 'Epsiode', i, 
    gain, total_loss, episode_len = 0, 0, 0
    state = None
    for _ in range(steps): # steps per episode
        action = learner.take_action(test_mdp.action_space())
        #print 's:', test_mdp.state, 'a:', action
        state, reward, done = test_mdp.step(action)
        
        state = np.array(state)[np.newaxis,:]
        loss = learner.eval_action(state, reward)
        #print loss
        gain += reward
        total_loss += loss
        episode_len += 1
        if loss > 1000:
            print '***Terminating episode, loss:', loss
            break
        if np.isnan(loss): 
            raise Exception('Loss is nan')
        if done: break
    avg_loss = total_loss / episode_len
    if i and i % printstep == 0: print '- gain:', gain, '- avg loss:', avg_loss
    gains.append(gain)
    losses.append(avg_loss)



Epsiode 33 - gain: -9.9 - avg loss: 0.000925208327991
Epsiode 66 - gain: -9.9 - avg loss: 0.00122125644339
Epsiode 99 - gain: -9.9 - avg loss: 0.00172355779257
Epsiode 132 - gain: -9.9 - avg loss: 0.00296909575203
Epsiode 165 - gain: -11.0 - avg loss: 0.00366609148505
Epsiode 198 - gain: -11.0 - avg loss: 0.000402217380794
Epsiode 231 - gain: -11.0 - avg loss: 0.00137916773973
Epsiode 264 - gain: -11.0 - avg loss: 0.00505472680599
Epsiode 297 - gain: -11.0 - avg loss: 0.00290457651005
Epsiode 330 - gain: -11.0 - avg loss: 0.00183775372722
Epsiode 363 - gain: -9.9 - avg loss: 0.00326481702151
Epsiode 396 - gain: -9.9 - avg loss: 0.00123425291911
Epsiode 429 - gain: -11.0 - avg loss: 0.000794713857608
Epsiode 462 - gain: -11.0 - avg loss: 0.00104373812096
Epsiode 495 - gain: -11.0 - avg loss: 0.000162536794791
Epsiode 528 - gain: -9.9 - avg loss: 0.000724639365442
Epsiode 561 - gain: -9.9 - avg loss: 0.00255153407982
Epsiode 594 - gain: -11.0 - avg loss: 0.000713429846204
Epsiode 627 - g

In [11]:
test_mdp.reset()
actions = test_mdp.action_space()
print actions
for s in test_mdp.state_space():
    if s == 'TERMINAL_STATE':
        s = (-1,-1)
    q = learner.model(Variable(torch.Tensor(np.array(s)[np.newaxis,:]))).data.numpy()[0]
    print s, ':', actions[q.argmax()], q

fig, ax = plt.subplots()
#plt.plot(gains, 'g') # shouldn't really decrease, since policy doesn't change
ax.plot(losses, 'r')
ax.set_xlabel('Episodes')
ax.set_ylabel('Avg Loss')
#plt.set_title('Gain')
plt.show()

('north', 'west', 'south', 'east')
(-1, -1) : west [-1.00002921 -1.0000248  -1.00002837 -1.00002658]
(0, 1) : east [-1.3985219  -1.39914322 -1.39900744 -1.39836359]
(1, 0) : west [-1.52545154 -1.52520812 -1.52570045 -1.52600408]
(1, 1) : north [-1.61597252 -1.61622596 -1.61646247 -1.61623621]
(1, 2) : east [-1.70649362 -1.7072438  -1.70722461 -1.70646846]
(2, 0) : west [-1.74290216 -1.74229085 -1.74315548 -1.7438767 ]
(2, 1) : west [-1.83342314 -1.8333087  -1.83391762 -1.83410883]
(2, 2) : north [-1.92394423 -1.92432654 -1.92467964 -1.9243412 ]
(3, 0) : west [-1.96035266 -1.95937359 -1.96061063 -1.96174932]
(3, 1) : west [-2.05087376 -2.05039144 -2.05137253 -2.05198145]
(3, 2) : north [-2.14139485 -2.14140916 -2.14213467 -2.14221382]
(4, 0) : west [-2.17780352 -2.17645621 -2.17806578 -2.17962193]
(4, 1) : west [-2.26832438 -2.26747417 -2.26882768 -2.26985407]
(4, 2) : west [-2.35884547 -2.3584919  -2.35958958 -2.36008644]
(5, 0) : west [-2.39525414 -2.39353895 -2.39552069 -2.39749479]


<IPython.core.display.Javascript object>

In [None]:
inits = np.vstack(initial_states)
losses = np.array([0] + losses)[:,np.newaxis]
print inits.shape[0], 'episodes'

In [None]:
stats = np.hstack([inits, losses])
print stats