In [1]:
import numpy as np
import math
from matplotlib import pyplot as plt
from random import choices
S = 200 # number of times repeat the task
M = 1 # simulation average
T = 20 # one episode length 
alpha = 0.5
eps = 0.05
gamma = 0.95
tau = 50

mu_low = 16
mu_high = 8
sigma = 0.5
r = 10
# reward dimension is 0
# threat dimension is 1
def linear_scalar (Q_rew, Q_thr, curr_state, w, epsilon):
    temp = Q_rew[curr_state,:]*w[0] + Q_thr[curr_state,:]*w[1]
    if np.random.uniform(0,1) < epsilon:
        action = np.random.randint(0,2)
    else:
        action = np.random.choice(np.array(np.where(temp == np.amax(temp))).flatten())
    return action
# multiply might be in the psychiatry literature
# 
def softmax_one (Q_rew, Q_thr, curr_state, w, tau):
    temp = (Q_rew[curr_state,:]*w[0] + Q_thr[curr_state,:]*w[1])/tau 
    ex = np.exp(temp - np.max(temp))
    weights = ex/np.sum(ex)
    #print(weights)
    population = np.arange(np.size(Q_rew,1))
    
    action = choices(population, weights)
    return action[0]
    
def update(curr_action,curr_state, Q, next_action, next_state, r):
    # next_action is always greedy action when we do Q-learning
    predict = Q[curr_state, curr_action]
    target = r+ gamma*Q[next_state, next_action]
    Q[curr_state, curr_action] = Q[curr_state, curr_action]*(1-alpha) + alpha*target
    #print("reward = ", r)

# action 0 is to transition
# action 1 is to cash out
def step(curr_state, curr_action, t, p):
    pop = False
    cashout = False
    if t < p:
        # balloon has not popped
        if curr_action == 1:
            next_state = 0
            reward = 0
            threat = 0
            cashout = True
        else:
            # action = 0, so decided to pump
            next_state = curr_state + 1
            reward = 100
            threat = 0
    else:
        # balloon popped 
        next_state = 0
        reward = 0
        threat = -t-1
        pop = True
    return reward, threat, next_state,pop, cashout
# action 0 is to transition
# action 1 is to cash out
def stepp(curr_state, curr_action, t):
    if curr_action == 0:
        if curr_state < T - 1:
            next_state = curr_state+1
            reward = r
        else:
            next_state = 0
            reward = r
    else:
        next_state = 0
        reward = 0
    
    return reward, next_state

#-------Training starts here---------
# partition parameter
p = 0.10
W = int(1/p) + 1


Q_rew = np.zeros((20,2,W, M))
Q_thr = np.zeros((20,2,W, M))
cashout_time = np.zeros((W, M))
for w in range(W):
    weight = [w*p, 1.0-w*p]
    for m in range(M):
        cashout_time_temp = np.zeros(S)
        for s in range(S):
            p = np.random.normal(mu_low, sigma)
            #print("bad probability = ", p)
            pop = False
            cashout = False
            threat = 0
            curr_state = 0
            for t in range(T):
                curr_action = softmax_one(Q_rew[:,:,w,m], Q_thr[:,:,w,m], curr_state,weight, tau)
                #print(curr_action)
                reward, next_state = stepp(curr_state, curr_action,t)
                #print(next_state)
                next_action = linear_scalar(Q_rew[:,:,w,m], Q_thr[:,:,w,m], next_state, weight, 0.0)
                if curr_state > p:
                    pop = True
                    threat = - t-1
                    next_state = 0
                    reward = 0
                    cashout_time_temp[s] = t
                    update(curr_action, curr_state, Q_rew[:,:,w,m], next_action, next_state, reward)
                    update(curr_action, curr_state, Q_thr[:,:,w,m], next_action, next_state, threat)
                    break
                elif curr_action == 1:
                    cashout = True
                    cashout_time_temp[s] = t
                    #print("cashout time = ", t+1, "w = ", w)
                    next_state = 0
                    reward = 0
                    update(curr_action, curr_state, Q_rew[:,:,w,m], next_action, next_state, reward)
                    update(curr_action, curr_state, Q_thr[:,:,w,m], next_action, next_state, threat)
                    break
                else:
                    #print("reward = ", reward, "next_state = ", next_state)
                    #next_action = linear_scalar(Q_rew[:,:,w,m], Q_thr[:,:,w,m], next_state, weight, 0.0)
                    curr_state = next_state 

                update(curr_action, curr_state, Q_rew[:,:,w,m], next_action, next_state, reward)
                update(curr_action, curr_state, Q_thr[:,:,w,m], next_action, next_state, threat)
#         print("w = ", w)
#         print(Q_rew[:,:,w,m])
#         print(Q_thr[:,:,w,m])
#         print(cashout_time_temp)
        a = cashout_time_temp[cashout_time_temp!=0]
#         print(a)
        L = np.size(a)
        temp = a[L-10:]
        #print(temp)
        cashout_time[w,m] = np.sum(temp)/np.size(temp)
        #print("--------------------------------------------")



In [2]:
print(cashout_time)
print(np.mean(cashout_time, 1))
#print(np.mean(cashout_time, 0))

[[ 2. ]
 [16.5]
 [16.4]
 [16.4]
 [16.6]
 [16.4]
 [16.8]
 [16.3]
 [16.6]
 [16.7]
 [16.4]]
[ 2.  16.5 16.4 16.4 16.6 16.4 16.8 16.3 16.6 16.7 16.4]
