## Import libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import gym
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque

## Set variables

In [None]:
# Widths of the Q-network's input layer (observation) and output layer
# (one Q-value per action).
state_size, action_size = 2, 3

# Replay buffer capacity and the number of transitions drawn per training step.
memory_size = 10000
batch_size = 64  # was 32

# Step size handed to the Adam optimizer.
# NOTE(review): 0.5 is several orders of magnitude above the previous value
# (0.001) -- confirm this is intentional.
learning_rate = 0.5  # was 0.001

# Discount applied to the best next-state Q-value when forming the target.
gamma = 0.99  # was 0.99

# Fraction of the TD error blended into the regression target for the action
# that was taken.
alpha = 0.1

## Import environment

In [None]:
# Per the original note, the object returned by gym.make caps an episode at
# 200 moves. Taking its `.env` attribute gives the environment it wraps, so
# episodes here are not cut off by that cap (unlimited moves).
_capped_env = gym.make("MountainCar-v0")
env = _capped_env.env

## Create model

In [None]:
# Q-network: maps a state vector of length `state_size` to one Q-value per
# action. Two 256-unit ReLU hidden layers feed a linear output layer of
# `action_size` units.
q_network = Sequential([
    Dense(256, activation='relu', input_dim=state_size),
    Dense(256, activation='relu'),
    Dense(action_size, activation='linear'),
])

# Train by regressing onto target Q-values with mean squared error.
optimizer = Adam(lr=learning_rate)
q_network.compile(optimizer=optimizer, loss='mse')

## Train model

In [None]:
# Train the Q-network with epsilon-greedy exploration and experience replay.
#
# After every environment step a minibatch of stored transitions is sampled
# and the network is fitted for one epoch on softened TD targets:
#     y[a] = Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
# where the bootstrap term is dropped for terminal transitions.

# (episode index, total reward) for every finished training episode.
rewards_list = []

# Exploration probability; multiplied by `epsilon_decay` at the start of each
# episode while it is still above 0.01.
epsilon = 0.1
epsilon_decay = 0.993

# Replay buffer of (state, action, reward, next_state, done) transitions.
# The explicit `done` flag marks terminal transitions, so no sentinel
# "all zeros" next state is needed (a sentinel can collide with a genuine
# observation and discards the real terminal state).
memory = deque(maxlen=memory_size)

for episode in range(200):

    state = np.reshape(env.reset(), [1, state_size])
    total_reward = 0
    done = False

    if epsilon > 0.01:
        epsilon *= epsilon_decay

    while not done:

        env.render()

        # Choose action: random with probability epsilon, otherwise greedy.
        if epsilon > np.random.rand():
            action = env.action_space.sample()
        else:
            action = np.argmax(q_network.predict(state)[0])

        # Apply action to environment, get next state/reward, and save the
        # transition to the replay buffer.
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        total_reward += reward

        memory.append((state, action, reward, next_state, done))
        state = next_state

        # Wait until the buffer can fill a whole minibatch.
        if len(memory) < batch_size:
            continue

        # random.sample accepts the deque directly; no full copy per step.
        minibatch = random.sample(memory, batch_size)

        mb_states = np.vstack([t[0] for t in minibatch])
        mb_actions = np.array([t[1] for t in minibatch], dtype=int)
        mb_rewards = np.array([t[2] for t in minibatch], dtype=np.float64)
        mb_next_states = np.vstack([t[3] for t in minibatch])
        mb_done = np.array([t[4] for t in minibatch], dtype=bool)

        # Two batched forward passes instead of two per sampled transition.
        # Both are evaluated before `fit`, i.e. with the same weights the
        # per-sample version would have used.
        y_train = np.array(q_network.predict(mb_states), dtype=np.float64)
        next_q_max = np.max(
            np.array(q_network.predict(mb_next_states), dtype=np.float64),
            axis=1)

        # Terminal transitions have no future value to bootstrap from.
        q_target = np.where(mb_done,
                            mb_rewards,
                            mb_rewards + gamma * next_q_max)

        # Only the Q-value of the action actually taken is moved, by a
        # fraction `alpha` of its TD error; other outputs keep their
        # current predictions so they contribute no loss.
        rows = np.arange(batch_size)
        q_predict = y_train[rows, mb_actions]
        y_train[rows, mb_actions] = q_predict + alpha * (q_target - q_predict)

        q_network.fit(mb_states, y_train, epochs=1, verbose=0)

    # Progress report after every episode.
    print('Episode: {}'.format(episode+1),
          'Total reward: {}'.format(total_reward),
          'Explore P: {:.4f}'.format(epsilon))

    rewards_list.append((episode, total_reward))

# Close the render window. `env.close()` is used because the `close=True`
# argument to render() is not accepted by later gym releases.
env.close()

## Test model

In [None]:
# Evaluate the trained Q-network: run 5 episodes with a purely greedy policy
# (no exploration) and print the episode number and the number of steps taken.
#
# NOTE(review): each episode only ends when env.step reports done; if `env`
# has no step cap and the greedy policy never finishes, this loop will not
# terminate -- confirm whether a step limit is wanted here.
for episode in range(5):

    counter = 0
    done = False
    state = env.reset()

    while not done:
        env.render()
        # The network expects a batch dimension: shape (1, state_size).
        state = np.reshape(state, [1, state_size])
        action = np.argmax(q_network.predict(state)[0])
        state, reward, done, _ = env.step(action)
        counter += 1

    print(episode + 1, counter)

# Close the render window. `env.close()` is used because the `close=True`
# argument to render() is not accepted by later gym releases.
env.close()