In [1]:
import gym
import numpy as np

In [2]:
# Shared 'MountainCar-v0' environment; the training and evaluation cells below read it as a global.
env = gym.make('MountainCar-v0')

In [3]:
class Agent:
    """Tabular Q-learning agent with epsilon-greedy exploration.

    The two-component state (state[0] = position, state[1] = velocity) is
    discretised onto a grid and a table of action values, `self.board`, is kept
    with shape (position cells, velocity cells, number of actions).

    Every tunable value is a keyword argument; the defaults reproduce the
    constants that were previously hard-coded.

    Args:
        env: object exposing `observation_space.low`, `observation_space.high`
            (index 0 = position bound, index 1 = velocity bound) and
            `action_space.n`.
        prob: initial probability of picking a random action in `Move`.
        step_prob: amount subtracted from `prob` by each `ReduceRandom` call.
        step_pos: grid cells per unit of position.
        step_vel: grid cells per unit of velocity.
        learning_rate: step size of the value update in `UpdateBoard`.
        discount: weight of the best next-state value in `UpdateBoard`.
        goal_pos: position at or beyond which a transition is treated as
            reaching the goal.
        goal_bonus: amount added to the table entry on a goal transition.
    """

    def __init__(self, env, prob=0.2, step_prob=0.0002, step_pos=10.0, step_vel=100.0,
                 learning_rate=0.15, discount=0.9, goal_pos=0.5, goal_bonus=0.1):
        self.min_pos, self.max_pos = env.observation_space.low[0], env.observation_space.high[0]
        self.min_vel, self.max_vel = env.observation_space.low[1], env.observation_space.high[1]
        self.actions = env.action_space.n

        self.prob, self.step_prob = prob, step_prob
        self.step_pos, self.step_vel = step_pos, step_vel
        self.learning_rate, self.discount = learning_rate, discount
        self.goal_pos, self.goal_bonus = goal_pos, goal_bonus

        # +2 per axis: one slot so the index of the maximum value itself is valid,
        # plus one spare cell beyond it.
        board_size = (self.PosIndex(self.max_pos) + 2, self.VelIndex(self.max_vel) + 2, self.actions)
        self.board = np.random.uniform(low=-1, high=1, size=board_size)

    def PosIndex(self, pos):
        """Return the grid row for `pos` (offset from the minimum, scaled, truncated by int())."""
        return int((pos - self.min_pos) * self.step_pos)

    def VelIndex(self, vel):
        """Return the grid column for `vel` (offset from the minimum, scaled, truncated by int())."""
        return int((vel - self.min_vel) * self.step_vel)

    def Move(self, state):
        """Pick an action for `state`.

        With probability `self.prob` a uniformly random action is returned;
        otherwise the action with the highest table value for the state's cell.
        """
        if np.random.uniform() < self.prob:
            return np.random.randint(self.actions)
        return np.argmax(self.board[self.PosIndex(state[0]), self.VelIndex(state[1])])

    def UpdateBoard(self, old_state, new_state, action, reward):
        """Update the table entry for (`old_state`, `action`) after one transition.

        If the new position is at or beyond `goal_pos`, the entry is raised by
        `goal_bonus` and floored at 0 (the reward is not used in that branch).
        Otherwise the entry moves towards
        `reward + discount * max(board[new_state cell])` by `learning_rate`.
        """
        cell = (self.PosIndex(old_state[0]), self.VelIndex(old_state[1]), action)
        if new_state[0] >= self.goal_pos:
            self.board[cell] = max(0, self.board[cell] + self.goal_bonus)
        else:
            best_next = np.max(self.board[self.PosIndex(new_state[0]),
                                          self.VelIndex(new_state[1])])
            self.board[cell] += self.learning_rate * (reward +
                                                      self.discount * best_next -
                                                      self.board[cell])

    def ReduceRandom(self):
        """Lower the exploration probability by `step_prob`, never below 0."""
        self.prob = max(0, self.prob - self.step_prob)


In [4]:
class ResultTracker:
    """Collects episode scores in rounds of `round_length` results.

    Args:
        round_length: number of results that make up one round.
        verbose: when True, a summary of the round is printed just before the
            round's results are cleared. Defaults to False (silent).
    """

    def __init__(self, round_length, verbose=False):
        self.counter = 0              # number of completed rounds
        self.round_length = round_length
        self.verbose = verbose
        self.results = []             # scores of the round in progress
        self.extras = []              # not used by this class; kept for existing readers

    def AddResult(self, amount):
        """Record one score; when the round is full, bump the counter and start a new round."""
        self.results.append(amount)
        if len(self.results) >= self.round_length:
            self.counter += 1
            if self.verbose:
                self.PrintResults()
            self.results = []

    def PrintResults(self):
        """Print the average and best score of the round in progress.

        Prints a short notice instead of failing when no results are recorded.
        """
        if not self.results:
            print("#{}: no results recorded in the current round".format(self.counter))
            return
        # float() keeps the average exact under Python 2 integer division as well.
        average = float(sum(self.results)) / len(self.results)
        best = max(self.results)
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print("#{}: {} average over {} rounds, best is {}".format(self.counter,
                                                                  average, len(self.results), best))

In [5]:
# Train the agent: run `tries` episodes, updating the value table after every step.
agent = Agent(env)
tracker = ResultTracker(100)  # groups episode scores into rounds of 100
tries = 10000
for i in range(tries):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = agent.Move(state)
        new_state, reward, done, info = env.step(action)
        # Learn from the transition before advancing to the new state.
        agent.UpdateBoard(state, new_state, action, reward)
        state = new_state
        score += reward  # running total of rewards for this episode
    tracker.AddResult(score)
    # Lower the random-action probability once per episode.
    agent.ReduceRandom()

Just for fun: compare the solution that actually makes an effort to use ML with a greedy agent written in 5 minutes (no physics included).

In [6]:
class GreedyAgent:
    """Hand-written baseline that chooses its action from the velocity alone."""

    def Move(self, state):
        """Return action 0 while state[1] is below 0.01, otherwise action 2.

        NOTE(review): presumably 0 / 2 are the environment's push-left /
        push-right actions -- confirm against the environment's action space.
        """
        velocity = state[1]
        if velocity < 0.01:
            return 0
        return 2

In [7]:
def GetAverageResult(tries, agent, environment=None):
    """Run `tries` episodes with `agent` and return the mean episode score.

    The agent is only queried through `agent.Move(state)`; no learning call is
    made here.

    Args:
        tries: number of episodes to run; must be non-zero, otherwise the final
            division raises ZeroDivisionError.
        agent: object with a `Move(state)` method returning an action.
        environment: object with `reset()` returning the initial state and
            `step(action)` returning `(state, reward, done, info)`. Defaults to
            the notebook-level `env`.

    Returns:
        The sum of all rewards divided by `tries`, as a float.
    """
    sim = env if environment is None else environment
    amount = 0
    for i in range(tries):
        state = sim.reset()
        score = 0
        done = False
        while not done:
            state, reward, done, info = sim.step(agent.Move(state))
            score += reward
        amount += score
    # float() avoids floor division under Python 2 when rewards are integers.
    return float(amount) / tries

Final results for both Agent and GreedyAgent:

In [8]:
# Evaluate both agents over 10000 episodes each.
# The parenthesised single-argument print emits the same text on Python 2 and Python 3.
print("Average result over 10000 tries for Agent is {}".format(GetAverageResult(10000, agent)))
print("Average result over 10000 tries for GreedyAgent is {}".format(GetAverageResult(10000, GreedyAgent())))

Average result over 10000 tries for Agent is -146.5849
Average result over 10000 tries for GreedyAgent is -158.3322
