In [1]:
import sys, os
import numpy as np
import itertools, random, time
from Bresenham import getPath
from utilities import getTransitions, getVPolicyFromQ, readTrackFile
from IPython.display import display, clear_output

In [2]:
# Per-axis acceleration choices and the per-axis velocity range (-5 .. 5).
accelList = [1, 0, -1]
velocList = range(-5, 6)

# All (x, y) pairings; the x component is the outer (slowest-varying) index.
accels = [(ax, ay) for ax in accelList for ay in accelList]
velocs = [(vx, vy) for vx in velocList for vy in velocList]

# Display glyph for each (x, y) acceleration pair.
actsPrint = dict([
    ((0, 0), "o"),
    ((1, 1), "\N{North East Arrow}"),
    ((1, 0), "\N{Rightwards Arrow}"),
    ((1, -1), "\N{South East Arrow}"),
    ((0, -1), "\N{Downwards Arrow}"),
    ((-1, -1), "\N{South West Arrow}"),
    ((-1, 0), "\N{Leftwards Arrow}"),
    ((-1, 1), "\N{North West Arrow}"),
    ((0, 1), "\N{Upwards Arrow}"),
])

In [3]:
# Read the O-track description from data/O-track.txt. readTrackFile is given
# the velocity pairs and returns the state collection, the goals, the track
# grid and the start points (as unpacked below).
fPath = os.path.join('data','O-track.txt')
O_states,O_goals,O_track,O_starts = readTrackFile(fPath, velocs)
# Transition table for the O-track using getTransitions' default settings.
O_TRs = getTransitions(O_states, accels, O_track, O_starts) # all trans from all states
# Same table built with hardCrash=True.
# NOTE(review): presumably a crash sends the car back to a start point in this
# variant - confirm against utilities.getTransitions.
O_TRs_c = getTransitions(O_states, accels, O_track, O_starts,hardCrash=True) # all trans from all states, hardCrash variant

In [10]:
def calcQfromV(state, tr, V_0, failPr, gamma):
    """Compute Q(s, a) for every action of one state from a value table.

    An attempted acceleration succeeds with probability ``1 - failPr``;
    otherwise the car behaves as if ``(0, 0)`` had been chosen. Both the
    reward and the successor value are averaged over those two outcomes:

        Q(s, a) = E[r | s, a] + gamma * E[V(s') | s, a]

    Parameters
    ----------
    state : unused; kept so existing callers do not have to change.
    tr : dict mapping action -> (next state, reward) for this state. It must
        contain the ``(0, 0)`` action, which defines the failure outcome.
    V_0 : dict mapping state -> current value estimate.
    failPr : probability that the attempted acceleration fails.
    gamma : discount factor.

    Returns
    -------
    dict mapping each action in ``tr`` (same order) to its Q value.
    """
    # Outcome of a failed acceleration: the car coasts with (0, 0).
    failSt, failReward = tr[(0,0)]
    okPr = 1 - failPr

    Q = dict()
    for accel,(newState,reward) in tr.items(): # loop over all transitions
        # The failure branch contributes its own reward, not the attempted one.
        expReward = okPr*reward + failPr*failReward # E[r|s,a]
        expValue = okPr*V_0[newState] + failPr*V_0[failSt] # sum of pr*V
        Q[accel] = expReward + gamma * expValue
    return Q

def maxDiffTwoVs(v1, v2):
    """Return the largest absolute difference between two value tables.

    Entries are matched by key, so the result does not depend on the
    insertion order of either dict.

    Parameters
    ----------
    v1, v2 : dicts mapping state -> numeric value; ``v2`` must contain every
        key of ``v1``.

    Returns
    -------
    max over keys of ``|v1[k] - v2[k]|``, or 0 when ``v1`` is empty.

    Raises
    ------
    KeyError if a key of ``v1`` is missing from ``v2``.
    """
    return max((abs(val - v2[key]) for key, val in v1.items()), default=0)

In [11]:
################################################################################
def valueIteration(states, TRs_all, gamma=0.9, eps=1e-9, pr_fail=0.2, trace=False,
                   maxIters=10000):
    """Run synchronous value iteration over all states.

    Each sweep recomputes every state's Q values from the previous sweep's
    value table, then takes the value and action that getVPolicyFromQ
    returns for those Q values.

    Parameters
    ----------
    states : re-iterable collection of hashable states.
    TRs_all : dict mapping state -> {action: (next state, reward)}.
    gamma : discount factor.
    eps : stop once the largest value change in a sweep drops below this.
    pr_fail : probability that an attempted acceleration fails.
    trace : if truthy, print the value change every 10 sweeps and a summary.
    maxIters : stop once the zero-based sweep index reaches this number,
        even if the values have not converged.

    Returns
    -------
    (pols, t, maxDiff) : the per-state result of getVPolicyFromQ (its second
        return value), the zero-based index of the last sweep, and the
        largest value change in that sweep.
    """
    Vs = {k:0 for k in states}
    pols = dict()

    for t in itertools.count(): # loop until converged
        Vs_old = Vs.copy() # every update in this sweep reads the old values
        for st in states: # loop over all states
            Q_st = calcQfromV(st, TRs_all[st], Vs_old, pr_fail, gamma)
            Vs[st],pols[st] = getVPolicyFromQ(Q_st)

        maxDiff = maxDiffTwoVs(Vs, Vs_old)
        if trace and (t % 10 == 0):
            print('[%05d]: diff=%f'%(t,maxDiff))

        if (maxDiff<eps) or (t >= maxIters): # converged, or sweep budget used up
            break

    if trace:
        print('Total iters: %d, max util diff = %f'%(t,maxDiff))
    return pols, t, maxDiff

In [None]:
# Solve the O-track with value iteration (default gamma/eps/pr_fail), printing
# progress. `b` receives the index of the last iteration and `c` the final
# maximum value change, matching valueIteration's return order.
pol_VI,b,c = valueIteration(O_states, O_TRs, trace=True)

In [95]:
def epsGreedy(Q, acts, temp=0.5):
    """Sample an action with softmax (Boltzmann) probabilities over Q values.

    Despite the name, selection is not epsilon-greedy: action ``a`` is drawn
    with probability proportional to ``exp(Q[a] / temp)``.

    Parameters
    ----------
    Q : dict mapping action -> Q value for the current state.
    acts : non-empty list of candidate actions (keys of ``Q``).
    temp : softmax temperature (> 0); lower values are greedier.

    Returns
    -------
    One element of ``acts``.
    """
    qs = np.array( [Q[k] for k in acts], dtype=float ) # all Q(s,a) values
    # Shift by the maximum before exponentiating: the largest exponent becomes
    # 0, so exp() cannot overflow and the normaliser is at least 1. Without
    # this, strongly negative Q values at low temperature underflow to 0/0.
    P_a = np.exp((qs - qs.max()) / temp) # numerator of softmax
    P_a = P_a / P_a.sum() # array of probabilities

    # Inverse-CDF sampling: first index whose cumulative probability exceeds
    # the uniform draw. Clamp in case rounding leaves cdf[-1] just below 1.
    cdf = P_a.cumsum()
    idx = int(np.searchsorted(cdf, np.random.random(), side='right'))
    return acts[min(idx, len(acts) - 1)]

def qEpisode(st, Q, TRs, gm, et, prF, trk, trace):
    """Run one Q-learning episode from ``st`` until a finish cell is reached.

    Parameters
    ----------
    st : starting state; ``st[0]`` and ``st[1]`` index the track column and row.
    Q : dict mapping state -> {action: Q value}; updated in place.
    TRs : dict mapping state -> {action: (next state, reward)}.
    gm : discount factor.
    et : learning rate.
    prF : probability that the attempted acceleration fails and the ``(0, 0)``
        transition is taken instead.
    trk : 2-D track grid indexed ``[row, col]``; ``'F'`` marks finish cells.
    trace : unused; kept so existing callers do not have to change.

    Returns
    -------
    (Q, t) : the same Q table and the number of steps taken (0 if ``st`` is
        already on a finish cell).
    """
    for t in itertools.count(): # loop for exploration
        if trk[st[1],st[0]] == 'F': # if curr state is a goal
            break

        tr = TRs[st] # all possible transitions from curr state
        # Temperature decays as exp(-t/75) within the episode, floored at 0.05.
        temp = max(0.05, 1/np.exp(t/150)**2)

        attempt = epsGreedy(Q[st], list(tr.keys()), temp) # softmax-sampled attempt
        if random.random() < prF: # failed to accelerate
            actual = (0,0)
        else: # acceleration applied as attempted
            actual = attempt

        newSt,reward = tr[actual] # outcome of what actually happened
        maxQ = max( Q[newSt][k] for k in TRs[newSt].keys() ) # best Q of new state
        # Credit the action the agent *chose*: the random failure is part of
        # the environment, so Q(s, attempt) must average over both outcomes.
        # Updating Q(s, actual) would teach Q values that ignore failure risk.
        Q[st][attempt] += et*(reward + gm*maxQ - Q[st][attempt])
        st = newSt

    return Q, t # return Q values and number of steps

In [109]:
################################################################################
def qLearning(states, accels, track, TRs, nEpisodes=100000, gamma=0.9, eta=0.1,
              pr_fail=0.2, trace=None):
    """Learn a policy with tabular Q-learning over random-start episodes.

    Parameters
    ----------
    states : iterable of hashable states.
    accels : iterable of actions; every state's Q row is initialised to 0
        for each of them.
    track : track grid handed to qEpisode to detect finish cells.
    TRs : dict mapping state -> {action: (next state, reward)}.
    nEpisodes : number of episodes to run.
    gamma : discount factor.
    eta : learning rate.
    pr_fail : probability that an attempted acceleration fails.
    trace : falsy for silence, otherwise an integer period: the episode
        length is printed every ``trace`` episodes and once at the end.

    Returns
    -------
    (policy, epsLen, Qs) : the per-state result of getVPolicyFromQ (its second
        return value), an int array of episode lengths, and the learned
        Q table.
    """
    states = list(states)
    # Finish states stay in the start pool, so an episode can begin on a
    # finish cell and end with length 0. The filter below would exclude them.
    #states = [x for x in states if track[x[1],x[0]]!='F'] # remove goal states

    Qs = {st: {a: 0 for a in accels} for st in states} # Q table, all 0's

    epsLen = np.zeros(nEpisodes, int) # length of each episode
    for ep in range(nEpisodes):
        start = random.choice(states) # choose random starting state
        # run one episode of q-learning, get updated Qs and the episode length
        Qs,epsLen[ep] = qEpisode(start, Qs, TRs, gamma, eta, pr_fail,
                                 track, trace)
        if trace and (ep%trace)==0:
            print('[%05d] Episode len = %d'%(ep,epsLen[ep]))
    # With nEpisodes == 0 the loop never binds `ep`, so skip the summary.
    if trace and nEpisodes > 0:
        print('[%05d] Last episode, len = %d'%(ep,epsLen[ep]))

    policy = dict()
    for st in states: # loop over all states to get policy for each state
        _, policy[st] = getVPolicyFromQ(Qs[st]) # best action accord. to Qs
    return policy, epsLen, Qs # return policy and stats

In [111]:
# Train a Q-learning policy on the O-track for 10000 episodes, printing the
# episode length every 1000 episodes. `ll` holds the per-episode lengths and
# `qq` the learned Q table (qLearning's return order).
pol,ll,qq = qLearning(O_states, accels, O_track, O_TRs, nEpisodes=10000, trace=1000)

[00000] Episode len = 1308
[01000] Episode len = 381
[02000] Episode len = 218
[03000] Episode len = 457
[04000] Episode len = 0
[05000] Episode len = 319
[06000] Episode len = 327
[07000] Episode len = 421
[08000] Episode len = 336
[09000] Episode len = 16
[09999] Last episode, len = 393


In [102]:
# Scratch check: softmax of the learned Q values for state (20, 17, 0, 0) at
# temperature 0.05; the displayed number is the largest action probability.
aa = np.exp(np.array([qq[(20, 17, 0, 0)][k] for k in accels]) /0.05)
(aa/aa.sum()).max()

0.5196386564674706

In [None]:
# Scratch lookup of the transitions out of state (21, 17, 0, -3).
# NOTE(review): `TRss` is only assigned in the following cell, so on a fresh
# kernel this raises NameError unless that cell has been run first.
a = TRss[(21,17,0,-3)]

In [None]:
# NOTE(review): unlike the earlier getTransitions calls in this notebook, this
# one does not pass O_starts - confirm the helper's signature allows that.
TRss = getTransitions(O_states,accels,O_track)

# NOTE(review): `transFromState` is neither defined nor imported in the
# visible cells - presumably a helper from utilities; verify before running.
z = transFromState((2,15,1,-2),accels,O_track)
list(z.items())[0]

In [None]:
# Scratch check: draw one start point at random (returned as a 1-element list).
random.sample(O_starts,1)

In [165]:
def trackToStr(track, state):
    """Render the track as text with the car's position marked by ``'X'``.

    Parameters
    ----------
    track : 2-D array of single-character cells, indexed ``[row, col]``;
        it is copied, not modified.
    state : sequence ``(x, y, x-velocity, y-velocity)``; a tuple, list or
        array is accepted.

    Returns
    -------
    str : one line per track row, followed by a line with the velocities.
    """
    trk = track.copy() # keep the caller's grid untouched
    trk[state[1],state[0]] = 'X'
    s = '\n'.join([''.join(l) for l in trk]) + '\n'
    # %-formatting needs a real tuple; a list or array slice would raise TypeError.
    s += 'x-vel=%d, y-vel=%d\n'%tuple(state[2:])
    return s

################################################################################
def simulateRace(trk, TRs, starts, pol, p_fail=0.2, trace=False):
    """Drive one race under a policy and report whether it reaches the finish.

    Parameters
    ----------
    trk : 2-D track grid indexed ``[row, col]``; ``'F'`` marks finish cells.
    TRs : dict mapping state -> {action: (next state, reward)}.
    starts : either a collection of start points (one is chosen at random)
        or a single start point given as a sequence of integers. A point may
        be a location ``(x, y)`` - zero velocity is appended - or a full
        4-component state.
    pol : dict mapping state -> action to attempt.
    p_fail : probability that the policy's action is replaced by ``(0, 0)``.
    trace : if truthy, redraw the track every step with a 0.5 s pause.

    Returns
    -------
    (t, startPt, viable) : steps taken, the start point used (as a tuple),
        and whether a finish cell was reached within the step limit.

    Raises
    ------
    ValueError if ``starts`` is empty.
    """
    stepLimit = np.prod(trk.shape)*2/(1-p_fail) # step limit for non-viability

    starts = list(starts)
    if not starts:
        raise ValueError('starts must contain at least one start point')
    if isinstance(starts[0], (int, np.integer)):
        startPt = tuple(starts) # a single point/state was passed directly
    else:
        startPt = tuple(random.choice(starts)) # choose starting point randomly
    # Only a location provided -> start at rest; full states are used as-is.
    st = startPt if len(startPt) == 4 else startPt + (0,0)

    viable = False
    for t in itertools.count(): # counter for total distance
        if trace:
            clear_output(wait=True)
            print(trackToStr(trk,st))
            time.sleep(0.5)

        if trk[st[1],st[0]] == 'F': # if curr state is a goal
            viable = True
            break
        if t > stepLimit: # too many steps, not viable policy
            break

        if random.random() < p_fail: # failed to accelerate
            accel = (0,0)
        else:
            accel = pol[st] # accel prescribed by the policy

        st = TRs[st][accel][0] # set new state to result of transition
    return t, startPt, viable # return steps taken, start pt, and viability

In [167]:
# Race once from every state in the policy, recording the steps taken (`a`)
# and whether the finish was reached (`b`). Both names overwrite earlier
# notebook globals of the same name.
a = np.zeros(len(pol), int)
b = np.zeros(len(pol), bool)

# NOTE(review): each 4-tuple state is passed as simulateRace's `starts`
# argument, i.e. a single state rather than a collection of start points -
# confirm simulateRace handles that form.
for n,st in enumerate(pol.keys()):
    a[n],tmp,b[n] = simulateRace(O_track, O_TRs, st, pol)

In [None]:
# Animate one race from a randomly chosen O-track start point under the
# learned policy; the trailing `;` suppresses the returned tuple.
simulateRace(O_track, O_TRs, O_starts, pol, p_fail=0.2, trace=True);

In [193]:
# Scratch check: converting a 1-D array to a tuple. Rebinds the global `a`.
a = np.array([0.2, 5.4])
tuple(a)

(0.2, 5.4)