In [10]:
import gym, tiles3

In [11]:
import numpy as np
import matplotlib.pyplot as plt

In [12]:
class StateSpaceTiler:
    """Tile-coding front end for a two-dimensional (x, y) state.

    Each coordinate is rescaled so that the interval [min, max] maps onto
    [0, num_tiles], and the rescaled pair is then handed to ``tiles3.tiles``
    together with an index hash table of the requested size.
    """

    def __init__(self, size, tilings, num_tiles, x_min, x_max, y_min, y_max):
        """Create the hash table and the per-axis rescaling callables.

        Args:
            size: Capacity passed to ``tiles3.IHT``.
            tilings: Number of tilings requested from ``tiles3.tiles``.
            num_tiles: Width, in tile units, that each [min, max] range is
                stretched to.
            x_min, x_max: Range of the first coordinate.
            y_min, y_max: Range of the second coordinate.
        """
        self.iht = tiles3.IHT(size)
        self.tilings = tilings
        self.num_tiles = num_tiles

        def build_rescaler(lower, upper):
            # self.num_tiles is looked up on every call, so a later change to
            # the attribute is picked up by the rescaler.
            def rescale(value):
                return ((value - lower) * self.num_tiles) / (upper - lower)
            return rescale

        self.x_transform = build_rescaler(x_min, x_max)
        self.y_transform = build_rescaler(y_min, y_max)

    def get_encoding(self, x, y):
        """Return the active tile indices for the point (x, y) as a NumPy array."""
        scaled_point = [self.x_transform(x), self.y_transform(y)]
        active_tiles = tiles3.tiles(self.iht, self.tilings, scaled_point)
        return np.array(active_tiles)

In [13]:
class Agent:
    """Softmax actor-critic agent over tile-coded (angle, velocity) states.

    The critic is a linear value function (``weights``) and the actor is a
    table of per-action preferences (``policy``), both indexed by the active
    tiles returned by ``StateSpaceTiler``. A running reward estimate
    (``reward_mean``) is subtracted from each reward when forming the TD error.
    """

    def __init__(self, params, train=False):
        """Build the agent from a hyperparameter mapping.

        Args:
            params: Mapping with the keys ``alpha_r``, ``tilings``,
                ``num_tiles``, ``actions``, ``iht_size``, ``alpha_w`` and
                ``alpha_t``.
            train: When False (the default) previously saved weights and
                policy are loaded from disk via ``load()``.
        """
        self.alpha_r = params.get("alpha_r")
        self.tilings = params.get("tilings")
        self.num_tiles = params.get("num_tiles")
        self.action_values = params.get("actions")
        self.iht_size = params.get("iht_size")
        # Step sizes are split across tilings because every tiling contributes
        # one active feature to each update.
        self.alpha_w = params.get("alpha_w")/self.tilings
        self.alpha_theta = params.get("alpha_t")/self.tilings
        self.tiler = StateSpaceTiler(self.iht_size, self.tilings, self.num_tiles, -np.pi, np.pi, -2*np.pi, 2*np.pi)
        self.weights = np.zeros((self.iht_size, ))
        self.policy = np.zeros((len(self.action_values), self.iht_size))
        self.reward_mean = 0
        if not train:
            self.load()

    def get_reward(self):
        """Return the current running reward estimate."""
        return self.reward_mean

    def save(self):
        """Write the policy and critic weights to ``policy.npy`` / ``weights.npy``."""
        np.save('policy.npy', self.policy)
        np.save('weights.npy', self.weights)

    def load(self):
        """Replace the policy and critic weights with the arrays saved by ``save()``."""
        self.weights = np.load('weights.npy')
        self.policy = np.load('policy.npy')

    def softmax_dist(self, state):
        """Return one preference per action: the sum of policy entries over the active tiles."""
        return np.sum(self.policy[:, state], axis=-1)

    def process(self, position):
        """Wrap an angle into [-pi, pi], keeping the sign of the input.

        The sign is taken with a comparison instead of ``position/abs(position)``
        so that an angle of exactly zero returns zero rather than raising
        ``ZeroDivisionError`` (Python float) or producing ``nan`` (NumPy float).
        """
        sign = -1.0 if position < 0 else 1.0
        wrapped = abs(position) % (2*np.pi)
        if wrapped > np.pi:
            wrapped -= 2*np.pi
        return wrapped*sign

    def softmax_prob(self, state):
        """Return the softmax action probabilities for the given active tiles."""
        preferences = self.softmax_dist(state)
        # Subtracting the maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing.
        preferences -= np.max(preferences)
        exp_pref = np.exp(preferences)
        return exp_pref/(np.sum(exp_pref))

    def softmax_action(self, state):
        """Sample an action index from the softmax distribution for ``state``."""
        return np.random.choice(len(self.action_values), p=self.softmax_prob(state))

    def agent_init(self, env_state):
        """Record the initial state, sample the first action and return its value.

        Args:
            env_state: ``(position, velocity)`` pair.
        """
        position, self.velocity = env_state
        self.position = self.process(position)
        self.previous_state = self.tiler.get_encoding(self.position, self.velocity)
        self.previous_action = self.softmax_action(self.previous_state)

        return self.action_values[self.previous_action]

    def get_value(self, state):
        """Return the critic's value estimate: the sum of weights over the active tiles."""
        return (np.sum(self.weights[state]))

    def agent_step(self, env_state, reward):
        """Apply one actor-critic update and return the next action value.

        Args:
            env_state: ``(position, velocity)`` pair observed after the
                previous action.
            reward: Reward received for the previous action.
        """
        position, self.velocity = env_state
        self.position = self.process(position)
        current_state = self.tiler.get_encoding(self.position, self.velocity)
        td_error = reward - self.reward_mean + self.get_value(current_state) - self.get_value(self.previous_state)
        # NOTE(review): the division by (1 + alpha_r) differs from the plain
        # `reward_mean += alpha_r * td_error` step; kept unchanged - confirm
        # that it is intentional.
        self.reward_mean = (self.reward_mean + self.alpha_r*td_error)/(1+self.alpha_r)

        self.weights[self.previous_state] += self.alpha_w*td_error
        prob_dist = self.softmax_prob(self.previous_state)
        for action in range(len(self.action_values)):
            # Softmax log-probability gradient w.r.t. each action preference:
            # (1 - p) for the action that was taken, -p for every other action.
            prob_scale = -prob_dist[action]
            if action == self.previous_action:
                prob_scale = 1 - prob_dist[action]
            self.policy[action][self.previous_state] += self.alpha_theta*td_error*prob_scale

        self.previous_state = current_state
        self.previous_action = self.softmax_action(self.previous_state)

        return self.action_values[self.previous_action]

    def play(self, env_state):
        """Return the most probable action value for ``env_state`` without learning."""
        position, velocity = env_state
        position = self.process(position)
        current_state = self.tiler.get_encoding(position, velocity)
        dist = self.softmax_prob(current_state)

        return self.action_values[np.argmax(dist)]
    
        
        

In [14]:
# Shared pendulum environment; train() and test() both step it and read env.state.
env = gym.make('Pendulum-v0')

In [15]:
# Hyperparameters read by Agent.__init__. alpha_w and alpha_t are divided by
# the number of tilings there; alpha_r is the step size of the running reward
# estimate; actions lists the torque values the agent may choose from.
agent_params = dict(
    alpha_r=2 ** -6,
    tilings=32,
    num_tiles=8,
    actions=[-1, 0, 1],
    iht_size=4096,
    alpha_w=0.001,
    alpha_t=2 ** (-5),
)

In [21]:
# The second argument is `train`; True starts from zeroed weights instead of
# loading weights.npy / policy.npy from disk.
agent = Agent(agent_params, True)

In [22]:
def train(agent, agent_params, env, num_steps=400000, log_interval=10000, render=True):
    """Run the agent in ``env`` for a fixed number of steps, then plot and save.

    Args:
        agent: Object providing ``agent_init``, ``agent_step``, ``get_reward``,
            ``save`` and the ``position`` / ``velocity`` attributes.
        agent_params: Unused; kept so existing call sites keep working.
        env: Environment exposing ``reset``, ``step``, ``render`` and ``state``.
        num_steps: Total number of environment steps to run.
        log_interval: A progress line is printed, and a point added to the
            plot, every this many steps.
        render: Whether to call ``env.render()`` after every step. Rendering
            each of several hundred thousand steps is slow; pass False for
            headless training.
    """
    env.reset()
    action = agent.agent_init(env.state)

    x_points = []
    y_points = []
    for step in range(num_steps):
        # Only the reward is used; the agent reads the raw env.state itself
        # and the `done` flag is ignored, so the loop never resets the env.
        _, reward, _, _ = env.step([action])
        if render:
            env.render()
        action = agent.agent_step(env.state, reward)
        if step % log_interval == 0:
            print ("Cycle {0} Value is {1}".format(step//log_interval, agent.get_reward()))
            print ("State is Position: {0}, Velocity {1}".format(agent.position, agent.velocity))
            x_points.append(step/1000)
            y_points.append(agent.get_reward())
    plt.plot(x_points, y_points)
    plt.xlabel("Training steps (thousands)")
    plt.ylabel("Running reward estimate")
    plt.show()
    agent.save()

In [23]:
def test(agent, env):
    env.reset()
    play_steps = 10000000
    for _ in range(play_steps):
        action = agent.play(env.state)
        env.step([action])
        env.render()

In [None]:
# Run the full training loop; it prints periodic progress, plots the running
# reward estimate and saves the agent's arrays when it finishes.
train(agent, agent_params, env)

Cycle 0 Value is -3.08919222088632
State is Position: 1.2686020703193848, Velocity 1.2167422855387202
Cycle 1 Value is -1.5634416053467228
State is Position: -0.4308779475135509, Velocity 0.5567798140435356
Cycle 2 Value is -2.249385520332149
State is Position: 1.948679662198586, Velocity -6.69743617746083
Cycle 3 Value is -2.2100983701477346
State is Position: -2.763140013970556, Velocity 8.0
Cycle 4 Value is -2.5022403634563326
State is Position: 1.4924107523752284, Velocity -2.274765582441436
Cycle 5 Value is -2.4868818561670367
State is Position: 1.3438458033013134, Velocity -1.1266808591840085
Cycle 6 Value is -2.8359872002497735
State is Position: -1.48864246252856, Velocity -2.0412946982671283
Cycle 7 Value is -2.813736083308669
State is Position: -2.155107076741217, Velocity -4.416853511945435
Cycle 8 Value is -2.699379743043648
State is Position: 2.098104071915081, Velocity -4.082452746938674
Cycle 9 Value is -2.769517654841018
State is Position: -1.3713662048361641, Velocity 