In [11]:
import numpy as np
import numpy.random as nr
import itertools
import scipy.misc
import matplotlib.pyplot as plt
import pandas as pd
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})

from mdp_class import MDP
from q_agent_class import Qagent
import matplotlib.animation as animation



In [9]:
# Build the transition tensor for a simple 2-stage choice task.
#
# Layout of the quantities defined here:
#   avail_actions[s] : list of global action ids that can be taken in state s
#   next_states[k]   : state reached (deterministically) by global action k
#   T_2d[k, s']      : 1 if global action k leads to state s', else 0
#   Tsas[s, a, s']   : same information, indexed by the a-th action of state s
#   n_actions[s]     : number of actions available in state s

n_states = 4 # add one for terminal state
n_actions_max = 2

# The per-state action lists are ragged (the terminal state has none), so the
# array must be created with dtype=object; NumPy >= 1.24 raises ValueError on
# ragged nested sequences otherwise.
avail_actions = np.array([[0,1],[2,3],[4,5],[]], dtype=object)
next_states = np.array([1,2,3,3,3,3])
n_total_actions = len(next_states)

# One-hot row per global action marking its successor state.
T_2d = np.zeros([n_total_actions, n_states])
T_2d[np.arange(n_total_actions), next_states] = 1

Tsas = np.zeros([n_states,n_actions_max,n_states])
for s in np.arange(n_states):
    # explicit integer dtype so that the empty list of the terminal state is
    # still a valid (empty) index array
    this_state_options = np.asarray(avail_actions[s], dtype=int)
    this_state_probs = T_2d[this_state_options,:]
    Tsas[s,0:len(this_state_options),:] = this_state_probs

# Derived from avail_actions so the two can never get out of sync.
n_actions = np.array([len(options) for options in avail_actions])

#exmdp = MDP(n_states,n_actions,Rsa,Tsas,terminal_states)



In [10]:
# Inspect the per-state action lists (one list of action ids per state).
avail_actions

array([list([0, 1]), list([2, 3]), list([4, 5]), list([])], dtype=object)

In [3]:
# Effect of the `which_Q_new` setting on the computed gain:
# compare 'single_step' against 'full', starting from the agent's initial Q_hat.

# low beta
# Rsa[s, a]: reward table with one row per state and one column per action.
Rsa = np.array([[0,0], [-10,5], [2,3],[0,0]])
params = {'beta': .1, 'alpha_q': 1, 'gamma': 1}

# Pass avail_actions explicitly, like every other Qagent construction in this
# notebook, so all agents are built from the same set of arguments.
qag = Qagent(params, n_states, n_actions, avail_actions, Tsas = Tsas, Rsa = Rsa)

print('beta: ', params['beta'])

print('\nQhat pre: \n',qag.Q_hat)

gain_ss = qag.comp_gain(which_Q_new = 'single_step')
print('\n single step gain :\n', gain_ss)

gain_full = qag.comp_gain(which_Q_new = 'full')
print('\n full gain :\n', gain_full)


beta:  0.1

Qhat pre: 
 [[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]

 single step gain :
 [[0.00 0.00]
 [2.31 0.61]
 [0.10 0.22]
 [0.00 0.00]]

 full gain :
 [[0.00 0.00]
 [3.47 1.84]
 [-0.05 0.07]
 [0.00 0.00]]


In [4]:
# high beta
# Same reward table as the low-beta cell; only params['beta'] differs (10 vs .1),
# so the two printed gain tables can be compared directly.
Rsa = np.array([[0,0], [-10,5], [2,3],[0,0]])
params = {'beta': 10, 'alpha_q': 1, 'gamma': 1}
qag = Qagent(params, n_states, n_actions, avail_actions, Tsas = Tsas, Rsa = Rsa)

print('beta: ', params['beta'])

# Q estimates of the freshly constructed agent, before any gain computation
print('\nQhat pre: \n',qag.Q_hat)

# gain table computed with which_Q_new = 'single_step'
gain_ss = qag.comp_gain(which_Q_new = 'single_step')
print('\n single step gain :\n', gain_ss)

# gain table computed with which_Q_new = 'full'
gain_full = qag.comp_gain(which_Q_new = 'full')
print('\n full gain :\n', gain_full)

beta:  10

Qhat pre: 
 [[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]

 single step gain :
 [[0.00 0.00]
 [5.00 2.50]
 [1.00 1.50]
 [0.00 0.00]]

 full gain :
 [[0.00 0.00]
 [7.50 7.50]
 [-0.50 0.50]
 [0.00 0.00]]


In [7]:
# adding rewards sequentially
# Start from an all-zero reward table, then reveal rewards one at a time and
# recompute the single-step gain after each change.
Rsa = np.array([[0,0], [0,0], [0,0],[0,0]])
params = {'beta': .1, 'alpha_q': 1, 'gamma': 1}
qag = Qagent(params, n_states, n_actions, avail_actions, Tsas = Tsas, Rsa = Rsa)
gain_ss = qag.comp_gain(which_Q_new = 'single_step')

# baseline: Q estimates, the policy derived from them, and the gain table
print('\n Q_hat :\n', qag.Q_hat)
print('\n policy: \n', qag.comp_pi(qag.Q_hat))
print('\n single step gain :\n', gain_ss)

# set the reward entry at (state 1, action 0) to -1000 and recompute the gain
qag.set_Rsa(1,0,-1000)
print('R \n', qag.Rsa)
gain_ss = qag.comp_gain(which_Q_new = 'single_step')
print('\n single step gain :\n', gain_ss)

# pick the (state, action) pair with the largest gain (argmax over the
# flattened table, converted back to 2-D indices) and back it up
(s,a) = np.unravel_index(np.argmax(gain_ss, axis=None), gain_ss.shape)
print('\n applying backup to max gain action:\n' ,s,a)
# NOTE(review): the meaning of reset=True is defined in Qagent.backup_Q, not visible here
qag.backup_Q(s,a,reset=True)
print('\n Q_hat :\n', qag.Q_hat)
print('\n policy: \n', qag.comp_pi(qag.Q_hat))

# gain table after the backup
gain_ss = qag.comp_gain(which_Q_new = 'single_step')
print('\n gain :\n', gain_ss)

# set the reward entry at (state 1, action 1) to 5 and recompute the gain
qag.set_Rsa(1,1,5)
print('\n adding in new R \n', qag.Rsa)

gain_ss = qag.comp_gain(which_Q_new = 'single_step')
print('\n gain :\n', gain_ss)



 Q_hat :
 [[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]

 policy: 
 [[0.50 0.50]
 [0.50 0.50]
 [0.50 0.50]
 [0.00 0.00]]

 single step gain :
 [[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]
R 
 [[    0     0]
 [-1000     0]
 [    0     0]
 [    0     0]]

 single step gain :
 [[0.00 0.00]
 [500.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]

 applying backup to max gain action:
 1 0

 Q_hat :
 [[0.00 0.00]
 [-1000.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]

 policy: 
 [[0.50 0.50]
 [0.00 1.00]
 [0.50 0.50]
 [0.00 0.00]]

 gain :
 [[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]

 adding in new R 
 [[    0     0]
 [-1000     5]
 [    0     0]
 [    0     0]]

 gain :
 [[0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]
 [0.00 0.00]]


In [None]:
# Run a few episodes of Q-learning: the agent samples an action, the
# environment returns the transition, and the agent updates its Q estimates.
#
# NOTE(review): `exmdp` is constructed in a later cell of this notebook; that
# cell has to be run before this one. `params` and `Rsa` are whatever the most
# recently executed cell above left behind.

# Use the same `Rsa` keyword/variable as every other Qagent construction in
# this notebook (the previous `Rsas = Rsas` referenced an undefined name).
qag = Qagent(params, n_states, n_actions, avail_actions, Tsas = Tsas, Rsa = Rsa)
choices = ['L', 'R']  # labels for actions 0 and 1, used by the debug print below

n_episodes = 3
env = exmdp
ag = qag
max_step = 10  # hard cap on steps per episode

for i in range(n_episodes):
    #print ('trial: ', i)

    S = env.reset(env.start_state)

    # j counts the steps taken in this episode (1-based)
    for j in range(1, max_step + 1):

        # sample action given by pi for state S
        a = ag.sample_action(S)

        # take action a, observe next state, reward, number of choices, terminal flag
        S_prime, r, nchoices, d = env.step(a)

        #print('S:',S, 'a:',choices[a],'Sp',S_prime,'r', r)

        # update the agent's Q estimates from this transition
        ag.update_Qlearn(S, a, r, S_prime)

        # move to the next state
        S = S_prime

        # stop as soon as the environment reports the episode is done
        if d:
            break
            


In [None]:
def sim_episode(env, choice_vec, max_step):
    """Replay a fixed sequence of actions in ``env`` for a single episode.

    The environment is reset to ``env.start_state`` and then stepped with the
    actions in ``choice_vec``, in order. Each transition is printed as
    ``state action reward next_state``. The episode stops when the environment
    reports termination, after ``max_step`` steps, or when ``choice_vec`` is
    exhausted, whichever comes first.

    Parameters
    ----------
    env : object
        Must expose ``start_state``, ``reset(state)`` returning the initial
        state, and ``step(action)`` returning a 4-tuple
        ``(next_state, reward, nchoices, done)``.
    choice_vec : sequence
        Actions to take; element ``k`` is used at step ``k + 1``.
    max_step : int
        Upper bound on the number of steps.

    Returns
    -------
    int
        Number of steps actually taken.
    """
    S = env.reset(env.start_state)

    # Never index past the end of choice_vec: previously a non-terminating
    # episode with max_step > len(choice_vec) raised IndexError.
    step_limit = min(max_step, len(choice_vec))

    j = 0
    while j < step_limit:

        # action prescribed for this step
        a = choice_vec[j]
        j += 1

        # take action a, observe next state, reward, number of choices, terminal flag
        S_prime, r, nchoices, d = env.step(a)

        print(S, a, r, S_prime)

        # move to the next state
        S = S_prime

        if d:
            break

    return j

In [None]:
# Build the example MDP and take action 0 three times in a row, printing what
# the environment returns on each step.
# NOTE(review): `Rsas` and `terminal_states` are not defined in any cell shown
# above in this notebook — confirm where they come from before running.
exmdp = MDP(n_states,n_actions,Rsas,Tsas,terminal_states)
for _ in range(3):
    a = 0
    (s, r, nchoices, done) = exmdp.step(a)
    print(s, r, nchoices, done)