# Control of mountain car using differential semi-gradient Sarsa with tile coding
# by Ehsan Kouchaki


In [1]:
import gym
import numpy as np
# Create the "MountainCar-v0" environment; the training and test cells below
# drive it through env.reset() / env.step().
env = gym.make("MountainCar-v0")
import time

# Tiling Code from Sutton: http://incompleteideas.net/tiles/tiles3.py-remove

In [2]:
# Alias for the builtin hash; used when a full table has to fall back to
# hashing (and by hashcoords for plain integer table sizes).
basehash = hash


class IHT:
    """Index hash table that assigns consecutive indices to hashable objects.

    Each new object receives the next free index (0, 1, 2, ...).  Once
    ``size`` distinct objects have been stored, further unseen objects are
    no longer stored; they are mapped to ``basehash(obj) % size`` instead,
    which may collide with indices already handed out.
    """

    def __init__(self, sizeval):
        """Create an empty table that can hold ``sizeval`` distinct objects."""
        self.dictionary = {}        # object -> assigned index
        self.overfullCount = 0      # lookups served by hashing after the table filled
        self.size = sizeval

    def __str__(self):
        """Return a one-line summary of size, overflow count and item count."""
        pieces = [
            "Collision table:",
            " size:", str(self.size),
            " overfullCount:", str(self.overfullCount),
            " dictionary:", str(len(self.dictionary)), " items",
        ]
        return "".join(pieces)

    def count(self):
        """Return how many objects currently have an assigned index."""
        return len(self.dictionary)

    def fullp(self):
        """Return True once the number of stored objects has reached ``size``."""
        return len(self.dictionary) >= self.size

    def getindex(self, obj, readonly=False):
        """Return the index for ``obj``, assigning a new one if necessary.

        With ``readonly=True`` an unseen object is not stored and ``None``
        is returned.  When the table is full, an unseen object is hashed
        into ``range(size)`` and ``overfullCount`` is incremented; a notice
        is printed the first time this happens.
        """
        table = self.dictionary
        if obj in table:
            return table[obj]
        if readonly:
            return None

        used = self.count()
        if used < self.size:
            # Still room: hand out the next consecutive index.
            table[obj] = used
            return used

        # Table is full: fall back to hashing and accept collisions.
        if self.overfullCount == 0:
            print('IHT full, starting to allow collisions')
        self.overfullCount += 1
        return basehash(obj) % self.size

def hashcoords(coordinates, m, readonly=False):
    """Map a coordinate list to a tile index according to the kind of ``m``.

    Parameters:
        coordinates: sequence of hashable values identifying one tile.
        m: selects the mapping:
            * ``None``  -- return ``coordinates`` unchanged;
            * an ``IHT`` (or subclass) instance -- return
              ``m.getindex(tuple(coordinates), readonly)``;
            * an ``int`` -- return ``basehash(tuple(coordinates)) % m``.
        readonly: forwarded to ``IHT.getindex``; ignored for the other kinds.

    Returns:
        The index (or the raw coordinates when ``m`` is ``None``).  For any
        other kind of ``m`` the function returns ``None``, as it always has.
    """
    # Identity test: an ``==`` comparison could be hijacked by a custom __eq__.
    if m is None:
        return coordinates
    # isinstance (rather than an exact type comparison) so that subclasses
    # are dispatched here instead of silently falling through to None.
    if isinstance(m, IHT):
        return m.getindex(tuple(coordinates), readonly)
    if isinstance(m, int):
        return basehash(tuple(coordinates)) % m
    return None

from math import floor, log
from itertools import zip_longest

def tiles(ihtORsize, numtilings, floats, ints=None, readonly=False):
    """Return ``numtilings`` tile indices for the given floats and ints.

    Parameters:
        ihtORsize: passed to ``hashcoords`` as the hashing target (an IHT,
            an integer size, or ``None``).
        numtilings: number of tilings; one index is produced per tiling.
        floats: iterable of numbers.  Each is multiplied by ``numtilings``
            and floored before being offset per tiling.
        ints: optional iterable of extra values appended to every tiling's
            coordinate list.  ``None`` (the default) means no extra values.
        readonly: forwarded to ``hashcoords``.

    Returns:
        A list of length ``numtilings`` holding the value ``hashcoords``
        returned for each tiling.
    """
    # Materialise once: avoids a shared mutable default argument and keeps a
    # one-shot iterator from being exhausted after the first tiling.
    extra = [] if ints is None else list(ints)
    qfloats = [floor(f * numtilings) for f in floats]
    Tiles = []
    for tiling in range(numtilings):
        stride = tiling * 2
        offset = tiling
        coords = [tiling]          # tiling number is the first coordinate
        for q in qfloats:
            coords.append((q + offset) // numtilings)
            offset += stride       # each successive dimension is shifted further
        coords.extend(extra)
        Tiles.append(hashcoords(coords, ihtORsize, readonly))
    return Tiles

# Train

In [3]:
# Problem size and learning hyper-parameters.
d = 4096                 # number of weights; also the capacity of the index hash table
W = np.zeros((d))        # weights of the linear action-value approximation
X = np.zeros((d))        # kept as a module-level name; the updates below index W directly
TilingNum = 8            #number of tilings
alpha = 0.1/TilingNum    # step size
gamma = 1
q_hat = np.zeros((3))
iht=IHT(d)


def _active_tiles(observation, act):
    """Return the tile indices that are active for (observation, act).

    observation[0] is scaled by TilingNum/(0.5+1.2) and observation[1] by
    TilingNum/(0.07+0.07) before tiling; the action is passed as the
    integer coordinate so every action gets its own set of tiles.
    """
    return tiles(iht, TilingNum, [TilingNum*observation[0]/(0.5+1.2),
                                  TilingNum*observation[1]/(0.07+0.07)], [act])


def _action_values(observation):
    """Return a fresh length-3 array with the estimated value of each action."""
    values = np.zeros((3))
    for ac in range(3):
        for idx in _active_tiles(observation, ac):
            values[ac] += W[idx]
    return values


stability = 0            # consecutive episodes that finished in fewer than 200 steps
for ep in range(500):
    state = env.reset()
    # Start every episode from freshly computed values; accumulating onto the
    # previous episode's q_hat would corrupt the first action and TD error.
    q_hat = _action_values(state)
    action = np.argmax(q_hat)
    if stability > 20:
        print('\n The learning prosecc terminated in episode number: ', ep, 'step number: ', step)
        break
    step = 0
    while True:
        step += 1
        state_prim, R, done, info = env.step(action)
        delta = R - q_hat[action]
        Feature_idx = _active_tiles(state, action)

        if done:
            # Terminal transition: the target is the reward alone.
            # NOTE(review): the recorded output shows done=True also when the
            # episode is cut off by 'TimeLimit.truncated'; those transitions
            # are treated as terminal here as well -- confirm this is intended.
            alpha_delta = alpha * delta
            for idx in Feature_idx:
                W[idx] += alpha_delta
            if step < 200:
                stability += 1
            else:
                stability = 0
            # Report the transition that ended the episode; stepping the
            # environment again after done is not a valid call.
            print((state_prim, R, done, info), 'step = ', step)
            break

        # Greedy choice in the next state, then bootstrap from its value.
        q_hat = _action_values(state_prim)
        action = np.argmax(q_hat)
        delta += gamma*q_hat[action]
        alpha_delta = alpha * delta
        for idx in Feature_idx:
            W[idx] += alpha_delta
        state = state_prim


(array([-0.47607297,  0.01600545]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.46096285,  0.02175793]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.70683656,  0.02336576]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.60680268,  0.02872425]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.69157173,  0.02312489]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.63959623,  0.02941225]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.59055094,  0.02459426]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.79326211,  0.00765249]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.81003554,  0.02314804]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.42824317, -0.0298488 ]), -1.0, True, {'TimeLimit.truncated': True}) step =  200
(array([-0.45513045,  0.03705392]), -1.0, True, {'TimeLimit.truncated': True}) step =  200

# Test

In [4]:
def _greedy_values(observation):
    """Return a length-3 array with the learned value of each action.

    Uses the same scaling and tiling call as the training cell and sums the
    weights W at the active tile indices for each action.
    """
    values = np.zeros((3))
    for ac in range(3):
        feature_idx = tiles(iht, TilingNum, [TilingNum*observation[0]/(0.5+1.2),
                                             TilingNum*observation[1]/(0.07+0.07)], [ac])
        for idx in feature_idx:
            values[ac] += W[idx]
    return values


# Run 20 rendered episodes, always taking the greedy action under W.
for ep in range(20):
    state = env.reset()
    # Choose the first action from the reset state instead of reusing whatever
    # action was left over from the previous episode or the training cell.
    q_hat = _greedy_values(state)
    action = np.argmax(q_hat)
    step = 0
    while True:
        step += 1
        env.render()
        time.sleep(.01)      # slow the loop down so the rendering can be followed
        state_prim, R, done, info = env.step(action)

        if done:
            # Print the transition that ended the episode; the environment
            # must not be stepped again once done is True.
            print('The state in episode: ', ep + 1, 'is: ', (state_prim, R, done, info), 'step = ', step)
            break

        q_hat = _greedy_values(state_prim)
        action = np.argmax(q_hat)
        state = state_prim
env.close()

The state in episode:  1 is:  (array([0.56737123, 0.04322614]), -1.0, True, {}) step =  167
The state in episode:  2 is:  (array([0.53519077, 0.03340791]), -1.0, True, {}) step =  121
The state in episode:  3 is:  (array([0.5362979 , 0.02519579]), -1.0, True, {}) step =  129
The state in episode:  4 is:  (array([0.57466205, 0.04462355]), -1.0, True, {}) step =  173
The state in episode:  5 is:  (array([0.55847023, 0.03892359]), -1.0, True, {}) step =  170
The state in episode:  6 is:  (array([0.57355846, 0.04337354]), -1.0, True, {}) step =  168
The state in episode:  7 is:  (array([0.52890017, 0.02343995]), -1.0, True, {}) step =  127
The state in episode:  8 is:  (array([0.53851369, 0.02588231]), -1.0, True, {}) step =  130
The state in episode:  9 is:  (array([0.54497444, 0.02640838]), -1.0, True, {}) step =  128
The state in episode:  10 is:  (array([0.5467539 , 0.02535753]), -1.0, True, {}) step =  131
The state in episode:  11 is:  (array([0.54320241, 0.04243572]), -1.0, True, {}