In [None]:
import gym
import random
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from IPython.display import clear_output

In [None]:
# Seed every random number generator used by the notebook before anything
# stochastic happens, so that "Restart Kernel -> Run All" repeats the same run
# (network weight initialisation, epsilon-greedy draws, replay sampling).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = gym.make("MountainCar-v0")

# Seed the environment as well when this gym release still offers Env.seed();
# releases without it are simply left unseeded here.
if hasattr(env, "seed"):
    env.seed(SEED)

In [None]:
# Length of one observation vector (first entry of the observation space shape).
os_size = env.observation_space.shape[0]

# Print the observation size together with the lower/upper bounds of the space.
print(f"Observation Space Size: {os_size}")
print(f"Observation Space Low : {env.observation_space.low}")
print(f"Observation Space High: {env.observation_space.high}")

# Print the action space on its own line group.
print(f"\nAction Space: {env.action_space}")

In [None]:
class Estimator:
    """Small fully connected Keras network used as a Q-value function approximator.

    The network maps one observation vector to one value per action and is
    trained with mean squared error against TD targets.
    """

    def __init__(self, n_inputs=None, n_actions=3):
        """Build, compile and summarise the network.

        Args:
            n_inputs: Length of one observation vector. ``None`` (the default)
                falls back to the notebook-level ``os_size``.
            n_actions: Number of output units, one per discrete action.
        """
        self.n_inputs = os_size if n_inputs is None else n_inputs
        self.n_actions = n_actions

        self.model = Sequential()
        self.model.add(Dense(8, input_shape=(self.n_inputs,), activation="relu"))
        self.model.add(Dense(8, activation="relu"))
        # The outputs are regression targets fitted with MSE against
        # reward-based TD targets, which are not confined to (0, 1) and do not
        # sum to one. A softmax head cannot represent them, so use a linear head.
        self.model.add(Dense(n_actions, activation="linear"))

        self.model.compile(optimizer="adam", loss="mse")

        self.model.summary()

    def predict(self, s):
        """Return the network output for ``s``.

        Args:
            s: One observation (or a batch); it is reshaped to
                ``(-1, n_inputs)`` before being passed to the model.

        Returns:
            Whatever ``self.model.predict`` returns for the reshaped input.
        """
        state = np.asarray(s).reshape(-1, self.n_inputs)

        # verbose=0 keeps Keras from printing progress output on every call.
        return self.model.predict(state, verbose=0)

    def update(self, s, a, y):
        """Fit the model so that the value of action ``a`` in state ``s`` moves towards ``y``.

        Args:
            s: Observation the update applies to.
            a: Index of the action whose value is replaced.
            y: Target value for that action.
        """
        state = np.asarray(s).reshape(-1, self.n_inputs)

        # Start from the current predictions so only the chosen action's output
        # contributes a non-zero error to the loss.
        td_target = self.predict(state)
        td_target[0][a] = y

        self.model.fit(state, td_target, verbose=0)

In [None]:
# Create the single Estimator instance that the later cells train and replay.
estimator = Estimator()

In [None]:
def getEpsilonGreedyPolicy(estimator, epsilon, nA):
    """Build an epsilon-greedy policy on top of ``estimator``.

    Args:
        estimator: Object whose ``predict(state)`` returns one value per action.
        epsilon: Probability mass spread evenly over all ``nA`` actions.
        nA: Number of actions.

    Returns:
        A function ``policy(state)`` that returns an array of ``nA`` action
        probabilities: ``epsilon / nA`` for every action, plus ``1 - epsilon``
        on the action with the highest predicted value.
    """
    def policy(state):
        # Uniform exploration share for every action.
        probabilities = np.full(nA, epsilon / nA, dtype=float)

        # The remaining mass goes to the currently best-valued action.
        greedy_action = np.argmax(estimator.predict(state))
        probabilities[greedy_action] += 1 - epsilon

        return probabilities

    return policy

In [None]:
def QLearning(env, estimator, num_episodes, discount=1.0, epsilon=0.1, batch_size=16,
              max_steps=200, reward_threshold=-200, render_every=None):
    """Train ``estimator`` with Q-learning and experience replay.

    Every environment step is stored in a replay memory. Once the memory holds
    more than ``batch_size`` transitions, each step also samples ``batch_size``
    of them at random and applies one estimator update per sampled transition.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        estimator: Object providing ``predict(state)`` and ``update(s, a, y)``.
        num_episodes: Number of episodes to run.
        discount: Discount factor applied to the best next-state value.
        epsilon: Exploration rate handed to ``getEpsilonGreedyPolicy``.
        batch_size: Number of transitions replayed per environment step.
        max_steps: Upper bound on the number of steps per episode.
        reward_threshold: An episode that ends with ``done`` is recorded only
            if its total reward is strictly greater than this value.
        render_every: When truthy, every ``render_every``-th episode is
            rendered; ``None`` (the default) never renders.

    Returns:
        A list of ``[episode_index, total_reward]`` entries, one for each
        episode that ended with ``done`` and beat ``reward_threshold``.

    Note:
        ``env.close()`` is always called before returning, including when the
        run is interrupted by an exception.
    """
    # Loop-invariant: look the action count up once instead of on every step.
    n_actions = env.action_space.n
    policy = getEpsilonGreedyPolicy(estimator, epsilon, n_actions)

    replay_memory = []
    episode_rewards = np.zeros(num_episodes)
    episode_finished = []

    try:
        for i_episode in range(num_episodes):
            state = env.reset()

            for _ in range(max_steps):
                if render_every and (i_episode + 1) % render_every == 0:
                    env.render()

                action = np.random.choice(n_actions, p=policy(state))
                next_state, reward, done, _info = env.step(action)

                episode_rewards[i_episode] += reward
                replay_memory.append((state, action, reward, next_state, done))

                # Start replaying only once a full batch can be drawn.
                if len(replay_memory) > batch_size:
                    replay_batch = random.sample(replay_memory, batch_size)
                    _replay_update(estimator, replay_batch, discount)

                if done:
                    if episode_rewards[i_episode] > reward_threshold:
                        episode_finished.append([i_episode, episode_rewards[i_episode]])
                    break

                state = next_state

            clear_output(True)
            print(f"[{i_episode + 1:>5}/{num_episodes:>5}] Episode Total Reward: {episode_rewards[i_episode]}")
    finally:
        env.close()

    return episode_finished


def _replay_update(estimator, batch, discount):
    """Apply one Q-learning update to ``estimator`` for each transition in ``batch``."""
    for state, action, reward, next_state, terminal in batch:
        td_target = reward

        # Only non-terminal transitions bootstrap from the next state's best value.
        if not terminal:
            td_target = reward + discount * np.max(estimator.predict(next_state))

        estimator.update(state, action, td_target)

In [None]:
# Train for 100 episodes with a very small exploration rate (epsilon = 0.0005)
# and keep the list of recorded episodes that QLearning returns.
episode_finished = QLearning(env, estimator, 100, epsilon=0.0005)

In [None]:
# Show at most the last five recorded episodes; the stored episode index is
# zero-based, so it is printed with +1.
print("Episodes Finished:")
for episode in episode_finished[-5:]:
    print(f"[{episode[0] + 1:>5}] Total Reward: {episode[1]}")

In [None]:
def playEnvOnce(env, estimator, max_steps=200):
    """Render one episode in which the estimator's best-valued action is always taken.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        estimator: Object whose ``predict(state)`` returns one value per action.
        max_steps: Upper bound on the number of steps to play.

    Note:
        ``env.close()`` is always called on exit, including when the rollout
        raises, so a rendering window is not left behind.
    """
    try:
        state = env.reset()

        for _ in range(max_steps):
            env.render()

            # Greedy action: index of the highest predicted value.
            action = np.argmax(estimator.predict(state))

            state, _reward, done, _info = env.step(action)

            if done:
                break
    finally:
        env.close()

In [None]:
# Render a single rollout driven by the estimator trained above.
playEnvOnce(env, estimator)