# ING Hubs Hackaton

29.01.2021

Jaime Elguero, Santiago Gil, Borja Serra

# Open AIGym

Gym is a toolkit for developing and comparing reinforcement learning algorithms. It makes no assumptions about the structure of your agent, and is compatible with any numerical computation library, such as TensorFlow or Theano.

The gym library is a collection of test problems — environments — that you can use to work out your reinforcement learning algorithms. These environments have a shared interface, allowing you to write general algorithms.

In [1]:
import gym

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Import and initialize Mountain Car Environment
env = gym.make('MountainCar-v0')
#env.reset()
#env.seed(1); np.random.seed(1)

**Mountain Car** is a classic reinforcement learning problem where the objective is to create an algorithm that learns to climb a steep hill to reach the goal marked by a flag. The car’s engine is not powerful enough to drive up the hill without a head start so the car must drive up the left hill to obtain enough momentum to scale the steeper hill to the right and reach the goal.

In [3]:
initial_learning_rate = 0.2
discount = 0.9
epsilon  = 0.8
min_eps  = 0
episodes = 10000

state = env.reset()

# Determine size of discretized state space
num_states = (env.observation_space.high - env.observation_space.low) * np.array([10, 100])
num_states = np.round(num_states, 0).astype(int) + 1

# Initialize Q table
Q = np.random.uniform(low = 0, high = 0, 
                      size = (num_states[0], num_states[1], 
                              env.action_space.n))

# Initialize Q table
revisited = np.random.uniform(low = 0, high = 0,
                              size = (num_states[0], num_states[1]))

# Initialize variables to track rewards
reward_list = []
ave_reward_list = []

max_position = state[0]
positions = np.ndarray([0,2])
successful = []
position = []

# Calculate episodic reduction in epsilon
reduction = (epsilon - min_eps)/episodes

# Run Q learning algorithm
for i in range(episodes):
        
    # Initialize parameters
    done = False
    tot_reward, reward = 0,0
    state = env.reset()
    
    max_position_episode = state[0]

    # Discretize state
    state_adj = (state - env.observation_space.low)*np.array([10, 100])
    state_adj = np.round(state_adj, 0).astype(int)

    while done != True:
        # Render environment for last five episodes
        #if i >= (episodes - 5):
        #    env.render()

        # Determine next action - epsilon greedy strategy
        if np.random.random() < 1 - epsilon:
            action = np.argmax(Q[state_adj[0], state_adj[1]])
        else:
            action = np.random.randint(0, env.action_space.n)

        # Get next state and reward
        state2, reward, done, info = env.step(action)
        
        if state2[0] > max_position:
            max_position = state2[0]
            positions = np.append(positions, [[i, max_position]], axis=0)
    
        # Discretize state2
        state2_adj = (state2 - env.observation_space.low) * np.array([10, 100])
        state2_adj = np.round(state2_adj, 0).astype(int)

        # Allow for terminal states
        if done:
            position.append(state2[0])
            if state2[0] >= 0.5:
                Q[state_adj[0], state_adj[1], action] = reward
                successful.append(i)

        # Adjust Q value for current state
        else:
            delta = learning*(reward + 
                              discount*np.max(Q[state2_adj[0], 
                                                state2_adj[1]]) - 
                              Q[state_adj[0], state_adj[1], action])
            Q[state_adj[0], state_adj[1],action] += delta
            revisited[state_adj[0], state_adj[1]] += 1

        # Update variables
        tot_reward += reward
        state_adj = state2_adj

    # Decay epsilon
    if epsilon > min_eps:
        epsilon -= reduction

    # Track rewards
    reward_list.append(tot_reward)

    if (i+1) % 100 == 0:
        ave_reward = np.mean(reward_list)
        ave_reward_list.append(ave_reward)
        reward_list = []
        print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

env.close()

# Run Q-learning algorithm
#rewards = QLearning(env, 0.2, 0.9, 0.8, 0, 5000)

NameError: name 'learning' is not defined

In [None]:
plt.figure(2, figsize=[10,5])
p = pd.Series(position)
ma = p.rolling(10).mean()
plt.plot(p, alpha=0.8)
plt.plot(ma)
plt.xlabel('Episode')
plt.ylabel('Position')
plt.title('Car Final Position')
plt.show()

In [None]:
# Plot Rewards
plt.subplots(figsize=(10,5))
plt.plot(100*(np.arange(len(ave_reward_list)) + 1), ave_reward_list)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes');

In [None]:
print('Furthest Position: {}'.format(max_position))
fig, ax = plt.subplots(figsize=(10,5))
ax2 = ax.twinx()
ax.plot(positions[:,0], positions[:,1])
ax2.hist(successful, bins = int(episodes/500));
ax.set_xlabel('Episodes');
ax.set_ylabel('Furthest Position');
ax2.set_ylabel('#Successful');
print('successful episodes: {}'.format(np.count_nonzero(successful)))

In [None]:
Q.argmax(axis=2)

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np

data = np.random.rand(10, 10) * 20

# create discrete colormap
cmap = colors.ListedColormap(['red', 'blue', 'green', 'black'])
bounds = [0,0.5,1.5,2.5,3]
norm = colors.BoundaryNorm(bounds, cmap.N)

Q_colors = Q.argmax(axis=2)
Q_colors[(revisited/revisited.sum()) < 0.0001] = 4

fig, ax = plt.subplots(figsize=(Q_colors.shape))
ax.imshow(Q_colors, cmap=cmap, norm=norm)

# draw gridlines
ax.grid(which='major', axis='both', linestyle='-', color='k', linewidth=2)
ax.set_xticks(np.arange(-.5, Q_colors.shape[1], 1));
ax.set_xticklabels(np.round(np.arange(env.observation_space.low[1], env.observation_space.high[1], 0.01), 2))
ax.set_xlabel('velocity')
ax.set_yticks(np.arange(-.5, Q_colors.shape[0], 1));
ax.set_yticklabels(np.round(np.arange(env.observation_space.low[0], env.observation_space.high[0], 0.1), 2))
ax.set_ylabel('position')

plt.show()

#        0 - Red     Accelerate to the Left
#        1 - Blue    Don't accelerate
#        2 - Green   Accelerate to the Right

# Tweaking it

In [None]:
initial_learning_rate = 1.0  # initial learning rate
min_learning_rate = 0.001    # minimum learning rate
discount = 1
epsilon  = 0.1
min_eps  = 0
episodes = 10000

state = env.reset()

# Determine size of discretized state space
num_states = (env.observation_space.high - env.observation_space.low) * np.array([10, 100])
num_states = np.round(num_states, 0).astype(int) + 1

# Initialize Q table
Q = np.random.uniform(low = 0, high = 0, 
                      size = (num_states[0], num_states[1], 
                              env.action_space.n))

# Initialize Q table
revisited = np.random.uniform(low = 0, high = 0,
                              size = (num_states[0], num_states[1]))

# Initialize variables to track rewards
reward_list = []
ave_reward_list = []

max_position = state[0]
positions = np.ndarray([0,2])
successful = []
position = []

# Calculate episodic reduction in epsilon
reduction = (epsilon - min_eps)/episodes

# Run Q learning algorithm
for i in range(episodes):
    
    learning = max(min_learning_rate, initial_learning_rate * (0.85 ** (i//100)))
    
    # Initialize parameters
    done = False
    tot_reward, reward = 0,0
    state = env.reset()
    
    max_position_episode = state[0]

    # Discretize state
    state_adj = (state - env.observation_space.low)*np.array([10, 100])
    state_adj = np.round(state_adj, 0).astype(int)

    while done != True:
        # Render environment for last five episodes
        #if i >= (episodes - 5):
        #    env.render()

        # Determine next action - epsilon greedy strategy
        if np.random.random() < 1 - epsilon:
            action = np.argmax(Q[state_adj[0], state_adj[1]])
        else:
            action = np.random.randint(0, env.action_space.n)

        # Get next state and reward
        state2, reward, done, info = env.step(action)
        
        if state2[0] > max_position:
            max_position = state2[0]
            positions = np.append(positions, [[i, max_position]], axis=0)

        #if state2[0] > max_position_episode:
        #    max_position_episode = state2[0]
        #    reward = state2[0] + 0.5
    
        # Discretize state2
        state2_adj = (state2 - env.observation_space.low) * np.array([10, 100])
        state2_adj = np.round(state2_adj, 0).astype(int)

        # Allow for terminal states
        if done:
            position.append(state2[0])
            if state2[0] >= 0.5:
                #reward += 1
                epsilon *= .99
                Q[state_adj[0], state_adj[1], action] = reward
                successful.append(i)

        # Adjust Q value for current state
        else:
            delta = learning*(reward + 
                              discount*np.max(Q[state2_adj[0], 
                                                state2_adj[1]]) - 
                              Q[state_adj[0], state_adj[1], action])
            Q[state_adj[0], state_adj[1],action] += delta
            revisited[state_adj[0], state_adj[1]] += 1

        # Update variables
        tot_reward += reward
        state_adj = state2_adj

    # Decay epsilon
    #if epsilon > min_eps:
    #    epsilon -= reduction

    # Track rewards
    reward_list.append(tot_reward)

    if (i+1) % 100 == 0:
        ave_reward = np.mean(reward_list)
        ave_reward_list.append(ave_reward)
        reward_list = []
        print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

env.close()

# Run Q-learning algorithm
#rewards = QLearning(env, 0.2, 0.9, 0.8, 0, 5000)

In [None]:
plt.figure(2, figsize=[10,5])
p = pd.Series(position)
ma = p.rolling(10).mean()
plt.plot(p, alpha=0.8)
plt.plot(ma)
plt.xlabel('Episode')
plt.ylabel('Position')
plt.title('Car Final Position')
plt.show()

In [None]:
# Plot Rewards
plt.subplots(figsize=(10,5))
plt.plot(100*(np.arange(len(ave_reward_list)) + 1), ave_reward_list)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes');

In [None]:
print('Furthest Position: {}'.format(max_position))
fig, ax = plt.subplots(figsize=(10,5))
ax2 = ax.twinx()
ax.plot(positions[:,0], positions[:,1])
ax2.hist(successful, bins = int(episodes/500));
ax.set_xlabel('Episodes');
ax.set_ylabel('Furthest Position');
ax2.set_ylabel('#Successful');
print('successful episodes: {}'.format(np.count_nonzero(successful)))

In [None]:
Q_colors = Q.argmax(axis=2)
Q_colors

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np

data = np.random.rand(10, 10) * 20

# create discrete colormap
cmap = colors.ListedColormap(['red', 'blue', 'green', 'black'])
bounds = [0,0.5,1.5,2.5,3]
norm = colors.BoundaryNorm(bounds, cmap.N)

Q_colors = Q.argmax(axis=2)
Q_colors[(revisited/revisited.sum()) < 0.001] = 4

fig, ax = plt.subplots(figsize=(Q_colors.shape))
ax.imshow(Q_colors, cmap=cmap, norm=norm)

# draw gridlines
ax.grid(which='major', axis='both', linestyle='-', color='k', linewidth=2)
ax.set_xticks(np.arange(-.5, Q_colors.shape[1], 1));
ax.set_xticklabels(np.round(np.arange(env.observation_space.low[1], env.observation_space.high[1], 0.01), 2))
ax.set_xlabel('velocity')
ax.set_yticks(np.arange(-.5, Q_colors.shape[0], 1));
ax.set_yticklabels(np.round(np.arange(env.observation_space.low[0], env.observation_space.high[0], 0.1), 2))
ax.set_ylabel('position')

plt.show()

#        0 - Red     Accelerate to the Left
#        1 - Blue    Don't accelerate
#        2 - Green   Accelerate to the Right

# Send results

In [18]:
import hackathon_mad_hub_2101 as hack_tools

team = 'La Macarena'
score = 69

hack_tools.post_score(team, score)

"Score posted!"

