In [20]:
import gym, tiles3

In [21]:
import numpy as np

In [22]:
class StateSpaceTiler:
    def __init__(self, size, tilings, num_tiles, x_min, x_max, y_min, y_max):
        self.iht = tiles3.IHT(size)
        self.tilings = tilings
        self.num_tiles = num_tiles
        self.x_transform = lambda x: ((x - x_min)*self.num_tiles)/(x_max - x_min)
        self.y_transform = lambda y: ((y - y_min)*self.num_tiles)/(y_max - y_min)
    
    def get_encoding(self, x, y):
        return np.array(tiles3.tiles(self.iht, self.tilings, [self.x_transform(x), self.y_transform(y)]))

In [23]:
class Agent:
    def __init__(self, params):
        self.alpha_r = params.get("alpha_r")
        self.tilings = params.get("tilings")
        self.num_tiles = params.get("num_tiles")
        self.action_values = params.get("actions")
        self.iht_size = params.get("iht_size")
        self.alpha_w = params.get("alpha_w")/self.tilings
        self.alpha_theta =  params.get("alpha_t")/self.tilings
        self.tiler = StateSpaceTiler(self.iht_size,self.tilings, self.num_tiles, -np.pi, np.pi, -2*np.pi, 2*np.pi)
        self.weights = np.zeros((self.iht_size, ))
        self.policy = np.zeros((len(self.action_values), self.iht_size))
        self.reward_mean = 0
    
    def get_reward(self):
        return self.reward_mean
    
    def softmax_dist(self, state):
        return np.sum(self.policy[:, state], axis=-1)
    
    def process(self, position):
        sign = position/abs(position)
        position = abs(position) % (2*np.pi)
        if position > np.pi:
            position -= 2*np.pi
        return position*sign
    
    def softmax_prob(self, state):
        dist = self.softmax_dist(state)
        max_val = np.max(dist)
        dist -= max_val
        p = np.exp(dist)
        return p/(np.sum(p))
    def softmax_action(self, state):
        return np.random.choice(len(self.action_values), p = self.softmax_prob(state))
        
    def agent_init(self, env_state):
        position, self.velocity = env_state
        self.position = self.process(position)
        self.previous_state = self.tiler.get_encoding(self.position, self.velocity)
        self.previous_action = self.softmax_action(self.previous_state)
        
        return self.action_values[self.previous_action] 

    def get_value(self, state):
        return (np.sum(self.weights[state]))
    
    def agent_step(self, env_state, reward):
        position, self.velocity = env_state
        self.position = self.process(position)
        current_state = self.tiler.get_encoding(self.position, self.velocity)
        td_error = reward - self.reward_mean + self.get_value(current_state) - self.get_value(self.previous_state)
        self.reward_mean = (self.reward_mean + self.alpha_r*td_error)/(1+self.alpha_r)
        
        self.weights[self.previous_state] += self.alpha_w*td_error
        prob_dist = self.softmax_prob(self.previous_state)
        for action in range(len(self.action_values)):
            prob_scale = prob_dist[action]
            if action == self.previous_action:
                prob_scale = 1  - prob_dist[action]
            self.policy[action][self.previous_state] += self.alpha_theta*td_error*prob_scale
        
        self.previous_state = current_state
        self.previous_action = self.softmax_action(self.previous_state)
        
        return self.action_values[self.previous_action]
    
        
        

In [24]:
env = gym.make('Pendulum-v0')

In [25]:
agent_params = {
    "alpha_r": 2**-6,
    "tilings": 32,
    "num_tiles": 8,
    "actions": [-1, 0, 1],
    "iht_size": 4096, 
    "alpha_w": 2,
    "alpha_t": 2**(-2),
}

In [26]:
agent = Agent(agent_params)

In [27]:
num_steps = 100000
env.reset()
action = agent.agent_init(env.state)

In [28]:
x_points = []
y_points = []
for _ in range(num_steps):
    observation, reward, done, info = env.step([action])
    env.render()
    action = agent.agent_step(env.state, reward)
    if _ % 1000 == 0:
        print ("Value is {0}".format(agent.get_reward()))
        print ("State is Position: {0}, Velocity {1}".format(agent.position, agent.velocity))
        x_points.append(_/1000)
        y_points.append(agent.get_reward())


Value is -0.041679469914360286
State is Position: -1.6748229696453092, Velocity -0.5921242069630042
Value is -4.190547469490732
State is Position: 3.0860103345577405, Velocity -0.07820619167138199
Value is 89.60390885199628
State is Position: -3.1084284221164493, Velocity -0.11324422997095229
Value is -3043625.114698002
State is Position: -3.0978462306437318, Velocity 0.246255506132444
Value is -5735956.286900528
State is Position: -2.815114998777849, Velocity 0.4432383046443114
Value is 1390468.9590163052
State is Position: -3.0628061968751243, Velocity -0.4501589709045849
Value is 1301543.7987738084
State is Position: -2.8150856313308923, Velocity 0.443116285263254
Value is 488381.7196546548
State is Position: -3.062836068247699, Velocity -0.45003704922838467
Value is 458337.92057427776
State is Position: -2.815056273662313, Velocity 0.44299423177871966
Value is 158092.75284317255
State is Position: -3.062865929793813, Velocity -0.44991509124498225
Value is 196602.8291177508
State is

Value is -2.849204060265704
State is Position: -3.063993363386954, Velocity -0.44525388068290683
Value is -2.910970184583251
State is Position: -2.8139189756353318, Velocity 0.4382076494595742
Value is -2.8468605403696476
State is Position: -3.064022839828041, Velocity -0.4451305143403398
Value is -2.9676842042018046
State is Position: -2.8138900109392226, Velocity 0.43808423986480943
Value is -2.8476368839363038
State is Position: -3.0640523063468197, Velocity -0.4450071120822293
Value is -2.9078850624415145
State is Position: -2.8138610561131467, Velocity 0.4379607965686937
Value is -2.8206820970685134
State is Position: -3.064081762940905, Velocity -0.4448836739184361
Value is -2.8737118586133943
State is Position: -2.8138321111593223, Velocity 0.43783731958098043
Value is -2.807405790081485
State is Position: -3.0641112096079546, Velocity -0.44476019985873805
Value is -2.867121012741098
State is Position: -2.8138031760799525, Velocity 0.43771380891160944
Value is -2.795145449862641

In [10]:
env.close()