In [54]:
import numpy as np
import numpy.random as nr
import itertools
import scipy.misc
import matplotlib.pyplot as plt
import pandas as pd
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})


from mdp_class import MDP


In [187]:
class Qagent():
    def __init__(self,params,n_states, n_actions, available_actions, Tsas = [], Rsas = [], Rs = []):
        n_actions_max = np.max(n_actions)
        n_state_actions = np.sum(n_actions)
        
        self.n_actions_max = n_actions_max # max number of actions in a single state, scalar
        self.n_actions = n_actions # n actions in each state, a list
        self.available_actions = available_actions # which actions are available in each state
        self.n_state_actions = np.sum(n_actions)
        
        self.params = params
        self.n_states = n_states
        self.sa_list = [(s,j) for s in np.arange(n_states) for j in range(n_actions[s])] # list of state,action pairs
        
        self.Q_hat = np.zeros([n_states, n_actions_max])
        self.e = np.zeros(self.n_state_actions) # traces for each state (want this to be for each action , deal with later)
        self.S = 'S'
        self.S_prime = 'S'
        
        self.Tsas = Tsas
        self.Rsa = Rsas[:,:,0]
        self.Rs = Rs
    
    def reset(self):
        self.Q_hat = np.zeros([self.n_states, self.n_actions_max])
        self.S = 'S'
        self.S_prime = 'S'
        
    def comp_choice_probs_SM(self,s, Q_hat = []): 
        n_choices_s = self.n_actions[s]
        # apply softmax decision rule to the state
        choice_probs = np.zeros([self.n_actions_max])
        if len(Q_hat) == 0:
            Q_hat = self.Q_hat.copy()
     #   import pdb; pdb.set_trace() 
        Q_hat_s = Q_hat[s,0:n_choices_s]
        real_choice_probs = np.exp(self.params['beta']*Q_hat_s)/np.sum(np.exp(self.params['beta']*Q_hat_s))
        choice_probs[0:n_choices_s] = real_choice_probs
        return choice_probs
        
    def sample_action(self,s, Q_hat = []): # can add later what choice rule to use
        # sample an action under the current Q value function
        # build Q values for each action
       # import pdb; pdb.set_trace() 
        choice_probs = self.comp_choice_probs_SM(s)
        return nr.choice(np.arange(len(choice_probs)), p = choice_probs)
    
    def update_Qlearn(self, s, a, r, s_prime, Q_hat = [], reset = True):
        # try varying - do expected sarsa backup
        # 
        if len(Q_hat) == 0:
            Q_hat_new = self.Q_hat.copy()
        else:
            Q_hat_new = Q_hat.copy()
            
        #if s_prime == 7:
            #mport pdb; pdb.set_trace()
            
        if self.n_actions[s_prime] == 0:
            # terminal state
            target = self.params['gamma']*r
        else:
            target = self.params['gamma']*(r + np.max(Q_hat_new[s_prime,0:self.n_actions[s_prime]]))
            
        Q_hat_new[s,a] = (1 - self.params['alpha_q'])*Q_hat_new[s,a] + self.params['alpha_q']*target
        
        if reset:
            self.Q_hat = Q_hat_new
            
        return Q_hat_new
    
    def backup_Q(self,s,a, Q_hat = [], reset = True):
        if len(Q_hat) == 0:
            Q_hat_new = self.Q_hat.copy()
        else:
            Q_hat_new = Q_hat.copy()
        # does a single value iteration / bellman backup to Q
        V_hat = [np.max(Q_hat_new[s_prime,0:self.n_actions[s_prime]]) if self.n_actions[s_prime] > 0 else 0 for s_prime in np.arange(self.n_states)]
        Q_hat_new[s,a] = self.Rsa[s,a] + np.dot(self.Tsas[s,a,:],V_hat)
        
        if reset:
            self.Q_hat = Q_hat_new
        
    def comp_pi(self, Q_hat = []):
        if len(Q_hat) == 0:
            Q_hat = self.Q_hat.copy()
        pol_mtx = np.array([self.comp_choice_probs_SM(s,Q_hat) for s in range(self.n_states)])
        return pol_mtx
    
    def comp_Tss(self, pol_mtx, Tsas):
        return np.array([np.dot(pol_mtx[s,:],Tsas[s,::]) for s in np.arange(self.n_states)])
        
    def comp_SR(self,pol_mtx,Tss):
        return np.linalg.inv(np.eye(self.n_states) - self.params['gamma']*Tss)
    
    
    
    
        


In [188]:
n_states = 8
n_actions_max = 2


# build T and R for a simple 2 stage choice task

Tsas = np.zeros([n_states,n_actions_max,n_states]) 

avail_actions = np.array([[0,1],[2,3],[4,5],[6],[7],[8],[9],[]])
next_states = np.array([1,2,3,4,5,6,7,7,7,7])
n_total_actions = 10

T_2d = np.zeros([n_total_actions, n_states])
for i in np.arange(n_total_actions):
    T_2d[i,next_states[i]] = 1
    
for s in np.arange(n_states):
    this_state_options = avail_actions[s]
    this_state_probs = T_2d[this_state_options,:]
    Tsas[s,0:np.size(this_state_options),:] = this_state_probs
    

# number of available actions per state
n_actions = np.array([2,2,2,1,1,1,1,0])

Rs = np.array([0,0,0,-10, 5, 3, 2,0])
Rsas = np.zeros([n_states,n_actions_max,n_states])

# get the reward when you leave the state
for s in np.arange(n_states):
    Rsas[s,:,:] = Rs[s]

terminal_states = np.array([7])

exmdp = MDP(n_states,n_actions,Rsas,Tsas,terminal_states)

params = {'beta': .1, 'alpha_q': 1, 'gamma': 1}

qag = Qagent(params, n_states, n_actions, avail_actions, Tsas = Tsas, Rsas = Rsas)


print(qag.Rsa)


[[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [-10.00 -10.00]
 [5.00 5.00]
 [3.00 3.00]
 [2.00 2.00]
 [0.00 0.00]]


In [196]:
qag = Qagent(params, n_states, n_actions, avail_actions, Tsas = Tsas, Rsas = Rsas)
# test backups
print(qag.Q_hat)
qag.backup_Q(3,0)
print(qag.Q_hat)
qag.backup_Q(1,0)
print(qag.Q_hat)

[[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]
[[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [-10.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]
[[0.00 0.00]
 [-10.00 0.00]
 [0.00 0.00]
 [-10.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]


In [186]:
qag.Q_hat

0.0

In [131]:
qag = Qagent(params, n_states, n_actions, avail_actions)
choices = ['L', 'R']

n_episodes = 50
env = exmdp
ag = qag
max_step = 10

for i in range(n_episodes):
    #print ('trial: ', i)
    
    d = False
    j = 0
    S = env.reset(env.start_state)

    while j < max_step:
            
        # increase counter
        j += 1
        
        # sample action given by pi for state S
        a = qag.sample_action(S)
        
        # take action A, observe s1, r, terminal?
        S_prime,r,nchoices,d = env.step(a)
        
        #print(ag.Q_hat)
        #print('S:',S, 'a:',choices[a],'Sp',S_prime,'r', r)
        
        # update model
        ag.update_Qlearn(S,a,r,S_prime)
        

        # update S
        S = S_prime;
            
        if d == True:
            break
            


In [123]:
print(ag.Q_hat)

pol_mtx = np.array([ag.comp_choice_probs_SM(s,ag.Q_hat) for s in range(ag.n_states)])

print(pol_mtx)

ag.comp_pi(pol_mtx)

[[5.00 3.00]
 [-10.00 5.00]
 [3.00 2.00]
 [-10.00 0.00]
 [5.00 0.00]
 [3.00 0.00]
 [2.00 0.00]
 [0.00 0.00]]
[[0.55 0.45]
 [0.18 0.82]
 [0.52 0.48]
 [1.00 0.00]
 [1.00 0.00]
 [1.00 0.00]
 [1.00 0.00]
 [0.00 0.00]]


array([[0.50, 0.50],
       [0.48, 0.52],
       [0.50, 0.50],
       [1.00, 0.00],
       [1.00, 0.00],
       [1.00, 0.00],
       [1.00, 0.00],
       [0.00, 0.00]])

In [None]:
ag.Q_hat

In [124]:
def sim_episode(env, choice_vec, max_step):

    d = False
    j = 0
    S = env.reset(env.start_state)

    while j < max_step:
            
        # increase counter
        j += 1
        
        # sample action given by pi for state S
        a = choice_vec[j-1]
        
        # take action A, observe s1, r, terminal?
        S_prime,r,nchoices,d = env.step(a)
        
        print(S,a,r,S_prime)

        # update S
        S = S_prime;
            
        if d == True:
            break
            
    return j

In [None]:
exmdp = MDP(n_states,n_actions,Rsas,Tsas,terminal_states)
a = 0
(s,r,nchoices,done) = exmdp.step(a)
print(s,r,nchoices,done)
a = 0
(s,r,nchoices,done) = exmdp.step(a)
print(s,r,nchoices,done)
a = 0
(s,r,nchoices,done) = exmdp.step(a)
print(s,r,nchoices,done)

In [None]:
from graphviz import Digraph

In [None]:
dot = Digraph(comment='The Round Table')
dot  #doctest: +ELLIPSIS