In [84]:
import sys, os
import numpy as np
import itertools
from Bresenham import getPath
import random

In [2]:
# Per-axis acceleration choices and the inclusive per-axis velocity range.
accelList = [1,0,-1]
velocList = range(-5,6)

# Every ordered (x, y) pair of accelerations, and likewise of velocities.
accels = list(itertools.product(accelList, repeat=2))
velocs = list(itertools.product(velocList, repeat=2))

# Single display character for each acceleration pair ("o" = no acceleration).
actsPrint = {
    (0,0): "o",
    (1,1): "\N{North East Arrow}",
    (1,0): "\N{Rightwards Arrow}",
    (1,-1): "\N{South East Arrow}",
    (0,-1): "\N{Downwards Arrow}",
    (-1,-1): "\N{South West Arrow}",
    (-1,0): "\N{Leftwards Arrow}",
    (-1,1): "\N{North West Arrow}",
    (0,1): "\N{Upwards Arrow}",
}

In [3]:
################################################################################
def readTrackFile(file, velocities=None):
    """Load a racetrack text file and enumerate the car's state space.

    The first line of the file (the world size) is discarded. Every following
    non-blank line is one row of the track, one character per cell:
    '#' is a wall, 'S' a starting cell, 'F' a finish-line cell; any other
    character is ordinary drivable track.

    Parameters
    ----------
    file : str or path-like
        Path of the track file.
    velocities : iterable of (vx, vy), optional
        Velocity pairs combined with every drivable cell to form the states.
        Defaults to the module-level ``velocs`` list.

    Returns
    -------
    states : set of (x, y, vx, vy)
        Every drivable cell paired with every velocity.
    goals : set of (x, y)
        Finish-line cells.
    track : numpy.ndarray
        2-D character array indexed as ``track[y, x]`` (row, column).
    starts : set of (x, y)
        Starting cells.

    Raises
    ------
    ValueError
        If the file does not contain a rectangular, non-empty track.
    """
    if velocities is None:
        velocities = velocs  # module-level list of all (vx, vy) pairs

    with open(file, 'r') as f:
        rows = [list(line.strip()) for line in f]
    # drop the world-size header and any blank lines (e.g. a trailing newline)
    track = np.array([row for row in rows[1:] if row])
    if track.ndim != 2:
        raise ValueError('track file %r does not contain a rectangular grid' % (file,))

    locs, goals, starts = set(), set(), set()
    nRows, nCols = track.shape  # axis 0 is y (rows), axis 1 is x (columns)
    for y, x in itertools.product(range(nRows), range(nCols)):
        space = track[y, x]  # current track space
        if space == '#':  # wall space, not a valid state
            continue
        locs.add((x, y))  # valid location of the car
        if space == 'S':  # starting point
            starts.add((x, y))
        if space == 'F':  # finish-line space
            goals.add((x, y))

    states = set((x, y, a, b) for (x, y), (a, b) in itertools.product(locs, velocities))
    return states, goals, track, starts
    
# Load the 'O' track from data/O-track.txt (path is relative to the working
# directory) and keep its states, goal cells, grid and start cells as globals.
fPath = os.path.join('data','O-track.txt')
O_states,O_goals,O_track,O_starts = readTrackFile(fPath)

In [106]:
################################################################################
def transFromState(state,actions,track):
    """Compute the deterministic outcome of every acceleration from one state.

    Parameters
    ----------
    state : (x, y, vx, vy)
        Current location and velocity of the car.
    actions : iterable of (ax, ay)
        Candidate accelerations.
    track : 2-D array of single characters
        Track grid indexed as ``track[y, x]``; '#' is a wall, 'F' the finish.

    Returns
    -------
    dict
        ``{(ax, ay): ((x1, y1, vx1, vy1), reward)}``. The reward is 0 when the
        move reaches a finish cell and -1 otherwise. Hitting a wall, or leaving
        the grid, stops the car (velocity (0, 0)) at the last cell it reached.
    """
    nRows, nCols = track.shape # axis 0 is y (rows), axis 1 is x (columns)
    x0, y0, vx0, vy0 = state
    trans = dict()
    for ax,ay in actions: # loop through all possible accelerations
        vx1 = min(max(vx0 + ax,-5),5) # updated velocity, clamped to [-5, 5]
        vy1 = min(max(vy0 + ay,-5),5)
        # getPath comes from the Bresenham module; presumably it yields the
        # (x, y) cells visited while moving by (vx1, vy1) -- TODO confirm
        paths = getPath(x0, y0, vx1, vy1)

        r = -1 # default reward
        x1, y1 = (x0,y0) # set to starting location
        for xC,yC in paths: # x,y coordinate of every step in path
            # Guard the grid edges explicitly: a negative index would silently
            # wrap to the opposite side of the array and a too-large one would
            # raise IndexError. Leaving the grid is treated like a wall.
            offGrid = not (0 <= xC < nCols and 0 <= yC < nRows)
            if offGrid or track[yC,xC] == '#': # run into wall
                vx1, vy1 = (0,0)
                break
            if track[yC,xC] == 'F': # goes over finish line
                x1, y1 = (xC,yC)
                vx1, vy1 = (0,0)
                r = 0
                break
            x1, y1 = (xC,yC) # update location to new place
        trans[(ax,ay)] = ((x1,y1,vx1,vy1),r)  # new state after transition

    return trans
################################################################################
def getTransitions(states,accelerations,world):
    """Build the transition table ``{state: {action: (next_state, reward)}}``.

    A state located on a finish cell ('F' in ``world[y, x]``) gets a single
    (0, 0) action that keeps it on the same cell with zero velocity and
    reward 0. Every other state is expanded with ``transFromState``.
    """
    def _outgoing(s):
        px, py = s[0], s[1]
        if world[py, px] != 'F':
            # ordinary cell: one entry per candidate acceleration
            return transFromState(s, accelerations, world)
        # finish line: immobilize the car where it is, staying put is free
        return {(0,0): ((px, py, 0, 0), 0)}

    return {s: _outgoing(s) for s in states}

################################################################################
def getVPolicyFromQ(Q, tr=None):
    """Return the greedy value and action for one state's action values.

    Parameters
    ----------
    Q : dict
        ``{action: value}`` for a single state.
    tr : container of actions, optional
        When given (e.g. that state's transition dict), only actions that are
        also in ``tr`` are considered. Accepting it keeps two-argument callers
        such as ``getVPolicyFromQ(Q, transitions)`` working.

    Returns
    -------
    (value, action)
        The highest value and the action achieving it. Ties go to the action
        that comes first in ``Q``'s iteration order.

    Raises
    ------
    ValueError
        If there is no candidate action.
    """
    candidates = list(Q) if tr is None else [a for a in Q if a in tr]
    pol = max(candidates, key=Q.__getitem__) # action with the highest value
    return Q[pol], pol

In [34]:
def calcQfromV(state, tr, V_0, failPr, gamma):
    """One Bellman backup: action values of a state from current state values.

    With probability ``failPr`` the chosen acceleration fails and the car
    follows the (0, 0) transition instead, so both the reward and the
    successor value are expectations over those two outcomes:

        Q(a) = E[r | a] + gamma * E[V(s') | a]

    Parameters
    ----------
    state : unused
        Kept for interface compatibility.
    tr : dict
        ``{action: (next_state, reward)}`` for this state; must contain (0, 0).
    V_0 : dict
        ``{state: value}`` estimates used for the backup.
    failPr : float
        Probability that the acceleration fails.
    gamma : float
        Discount factor.

    Returns
    -------
    dict
        ``{action: Q-value}`` with the same keys as ``tr``.
    """
    failSt, failR = tr[(0,0)] # outcome of a failed acceleration
    okPr = 1 - failPr

    Q = dict()
    for accel,(newState,reward) in tr.items(): # loop over all transitions
        expR = okPr*reward + failPr*failR # E[r|s,a], including failures
        expV = okPr*V_0[newState] + failPr*V_0[failSt] # sum of pr*V
        Q[accel] = expR + gamma * expV
    return Q

def maxDiffTwoVs(v1, v2):
    """Largest absolute difference between two value tables.

    Values are matched by key, so the result does not depend on the order in
    which the two dicts were built. Returns 0 when ``v1`` is empty.

    Raises
    ------
    KeyError
        If a key of ``v1`` is missing from ``v2``.
    """
    return max((abs(val - v2[key]) for key, val in v1.items()), default=0)

In [108]:
################################################################################
def valueIteration(states, track, accels, gamma=0.9, eps=1e-9, pr_fail=0.2,
                   trace=False, max_iters=int(1e4)):
    """Solve the racetrack MDP with synchronous value iteration.

    Parameters
    ----------
    states : iterable of (x, y, vx, vy)
        All states; every state reachable by a transition must be included.
    track : 2-D array of single characters
        Track grid passed on to ``getTransitions``.
    accels : iterable of (ax, ay)
        Candidate accelerations.
    gamma : float
        Discount factor.
    eps : float
        Stop once the largest value change in a sweep drops below this.
    pr_fail : float
        Probability that an acceleration fails.
    trace : bool
        Print the value change every 10 sweeps and a final summary.
    max_iters : int
        Stop when the sweep index reaches this number even if not converged.

    Returns
    -------
    pols : dict
        ``{state: greedy action}``.
    t : int
        Index of the last sweep performed (0-based).
    maxDiff : float
        Largest value change in the last sweep.
    """
    TRs_all = getTransitions(states, accels, track) # all trans from all states

    Vs = {k:0 for k in states}
    pols = dict()

    for t in itertools.count(): # loop until converged
        Vs_old = Vs.copy() # every backup in this sweep reads the old values
        for st in states: # loop over all states
            q = calcQfromV(st, TRs_all[st], Vs_old, pr_fail, gamma)
            Vs[st],pols[st] = getVPolicyFromQ(q)

        maxDiff = maxDiffTwoVs(Vs, Vs_old)
        if trace and (t % 10 == 0):
            print('[%05d]: diff=%f'%(t,maxDiff))

        if (maxDiff<eps) or (t >= max_iters): # converged or iteration cap hit
            break

    if trace:
        print('Total iters: %d, max util diff = %f'%(t,maxDiff))
    return pols, t, maxDiff

In [109]:
def epsGreedy(Q, acts, temp=1):
    """Sample an action with softmax (Boltzmann) exploration.

    Despite the name, no epsilon is involved: each action in ``acts`` is drawn
    with probability proportional to ``exp(Q[action] / temp)``.

    Parameters
    ----------
    Q : dict
        ``{action: value}`` for the current state.
    acts : sequence
        Candidate actions; each must be a key of ``Q``.
    temp : float
        Softmax temperature; must be non-zero.

    Returns
    -------
    One element of ``acts``.
    """
    scaled = np.array([Q[k] for k in acts], dtype=float) / temp
    # Shifting by the maximum leaves the softmax unchanged but keeps exp()
    # from overflowing, or underflowing to an all-zero (0/0 = NaN) vector.
    P_a = np.exp(scaled - scaled.max())
    P_a = P_a / P_a.sum() # array of probabilities

    cdf = P_a.cumsum()
    cdf[-1] = 1.0 # rounding must not leave the last bin unreachable
    idx = np.argmax(np.random.random() < cdf) # first bin containing the draw
    return acts[idx]
################################################################################
def q_episode(st, Q, TRs, gm, et, prF, trk, fins, trace):
    """Run one Q-learning episode from ``st`` until a finish cell is reached.

    Parameters
    ----------
    st : (x, y, vx, vy)
        Starting state.
    Q : dict
        ``{state: {action: value}}``; updated in place.
    TRs : dict
        ``{state: {action: (next_state, reward)}}``.
    gm : float
        Discount factor.
    et : float
        Learning rate.
    prF : float
        Probability that the chosen acceleration fails and (0, 0) is applied.
    trk : unused
        Kept for interface compatibility.
    fins : container of (x, y)
        Finish cells; a state on one of them is terminal.
    trace : unused
        Kept for interface compatibility.

    Returns
    -------
    (Q, t, cumReward)
        The updated table, the number of steps taken and the summed reward.
    """
    cumReward = 0
    for t in itertools.count(): # loop for exploration
        if st[:2] in fins: # if curr state is a goal
            break

        tr = TRs[st] # all possible transitions from curr state
        attempt = epsGreedy(Q[st], list(tr.keys())) # desired act (softmax draw)
        if random.random() < prF: # failed to accelerate
            actual = (0,0)
        else: # successfully changed acceleration
            actual = attempt

        newSt,reward = tr[actual] # next state and reward for executed action
        if newSt[:2] in fins:
            maxQ = 0 # terminal state: nothing more to collect
        else:
            maxQ = max( Q[newSt][k] for k in TRs[newSt].keys() ) # Q of new state
        # Credit the action the agent chose, not the one the environment
        # executed, so the learned value accounts for the failure probability.
        Q[st][attempt] += et*(reward + gm*maxQ - Q[st][attempt]) # update Q(s,a)
        st = newSt
        cumReward += reward

    return Q, t, cumReward

In [111]:
################################################################################
def qLearning(states, accels, goals, nEpisodes=int(1e6), gamma=0.9, eta=0.1,
              pr_fail=0.2, trace=False, track=None):
    """Learn a racetrack policy with tabular Q-learning.

    Parameters
    ----------
    states : iterable of (x, y, vx, vy)
        All states of the track.
    accels : iterable of (ax, ay)
        Candidate accelerations.
    goals : container of (x, y)
        Finish cells.
    nEpisodes : int
        Number of episodes to run.
    gamma : float
        Discount factor.
    eta : float
        Learning rate.
    pr_fail : float
        Probability that an acceleration fails.
    trace : bool
        Passed through to ``q_episode``.
    track : 2-D array of single characters, optional
        Track grid. When omitted, a module-level variable named ``track`` is
        used, which is what this function relied on before the parameter
        existed.

    Returns
    -------
    policy : dict
        ``{state: greedy action}`` for every non-goal state.
    epsLen : numpy.ndarray
        Number of steps of each episode.
    epsReward : numpy.ndarray
        Cumulative reward of each episode.

    Raises
    ------
    ValueError
        If no track is passed and no module-level ``track`` exists.
    """
    if track is None:
        track = globals().get('track')
        if track is None:
            raise ValueError('qLearning needs the track grid: pass track=...')

    TRs = getTransitions(states, accels, track) # all trans from all states
    states = [x for x in states if x[:2] not in goals] # remove goal states

    # One zero entry per available action. Goal states keep their single
    # "stay put" entry so looking up a terminal successor never fails.
    Qs = {st: {a: 0.0 for a in TRs[st]} for st in TRs}

    epsLen = np.zeros(nEpisodes, int) # length of each episode
    epsReward = np.zeros(nEpisodes, int) # cumulative reward of each episode

    for ep in range(nEpisodes):
        start = random.choice(states) # choose random starting state
        # run one episode of q-learning, get new Qs, ep. len, ep. cum. reward
        Qs,epsLen[ep],epsReward[ep] = q_episode(start, Qs, TRs, gamma, eta,
                                                pr_fail, track, goals, trace)

    policy = dict()
    for st in states: # greedy action of every non-goal state
        _, policy[st] = getVPolicyFromQ(Qs[st])
    return policy, epsLen, epsReward # return policy and stats

In [114]:
# NOTE(review): within the visible notebook TRss is only assigned in a later
# cell (In [48]); that cell has to be run before this one on a fresh kernel.
a = TRss[(21,17,0,-3)]

In [59]:
# Run value iteration on the O track with progress tracing:
# a = policy dict, b = index of the last sweep, c = final max value change.
a,b,c = valueIteration(O_states, O_track, accels, trace=True)

[00000]: diff=1.000000
[00010]: diff=0.348678
[00020]: diff=0.121577
[00030]: diff=0.004735
[00040]: diff=0.000000
Total iters: 45, max util diff = 0.000000


In [48]:
# Transition table for every state of the O track.
TRss = getTransitions(O_states,accels,O_track)

# Spot check: transitions from one state, showing the first
# (action, (next state, reward)) entry.
z = transFromState((2,15,1,-2),accels,O_track)
list(z.items())[0]

((1, 1), ((4, 14, 2, -1), -1))

In [49]:
# Transitions of an arbitrary goal cell at zero velocity; goal states only
# have the (0, 0) "stay put" entry.
TRss[next(iter(O_goals))+(0,0)]

{(0, 0): ((4, 12, 0, 0), 0)}

In [89]:
# (x, y) location of an arbitrary state taken from the state set.
next(iter(O_states))[:2]

(21, 17)