<img src="./pics/DL.png" width=110 align="left" style="margin-right: 10px">

# Introduction to Deep Learning

## 09. Reinforcement Learning

---

# GYM random Mountain Car

In [None]:
import gym
# Roll out a single MountainCar-v0 episode with uniformly random actions and
# print everything the environment returns at each step.
env = gym.make('MountainCar-v0')
try:
    env.reset()
    for step_index in range(1000):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print("Step {}:".format(step_index))
        print("action: {}".format(action))
        print("observation: {}".format(observation))
        print("reward: {}".format(reward))
        print("done: {}".format(done))
        print("info: {}".format(info))
        if done:
            break
finally:
    # Release the render window even when the rollout fails or is interrupted.
    env.close()

Further reading:

- https://blog.tanka.la/2018/10/19/solving-curious-case-of-mountaincar-reward-problem-using-openai-gym-keras-tensorflow-in-python/
- https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c
- https://github.com/pylSER/Deep-Reinforcement-learning-Mountain-Car
- https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

# NN

In [None]:
import random

import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Shared environment used by the data-collection and evaluation cells below.
env = gym.make('MountainCar-v0')
env.reset()
# Maximum number of steps played in a single game.
goal_steps = 200
# Minimum (shaped) score a random game must reach to be kept as training data.
score_requirement = -198
# Number of random games played while collecting data.
# NOTE: the spelling "intial" is kept because other cells reference this name.
intial_games = 10000

def model_data_preparation():
    """Collect (observation, one-hot action) pairs from well-scoring random games.

    Plays ``intial_games`` games of at most ``goal_steps`` random actions on
    the module-level ``env``. The reward of a step is replaced by ``1`` when
    the first observation component is above ``-0.2``; games whose summed
    score reaches ``score_requirement`` contribute all of their
    (observation before the action, action) pairs to the result.

    The environment is reset before the first game and after every game, so
    it is left freshly reset when the function returns.

    Returns:
        list: ``[observation, one_hot_action]`` pairs, where
        ``one_hot_action`` is a 3-element list with a single ``1`` at the
        index of the action that was taken.
    """
    training_data = []
    accepted_scores = []

    # Keep the observation returned by reset(): it is the state in which the
    # first action of a game is chosen, so that pair must be recorded too.
    observation = env.reset()

    for game_index in range(intial_games):
        score = 0
        game_memory = []

        for step_index in range(goal_steps):
            action = random.randrange(0, 3)
            previous_observation = observation
            observation, reward, done, info = env.step(action)
            game_memory.append([previous_observation, action])

            # Reward shaping: replace the step reward once the first
            # observation component exceeds -0.2.
            if observation[0] > -0.2:
                reward = 1

            score += reward
            if done:
                break

        if score >= score_requirement:
            accepted_scores.append(score)
            for seen_observation, taken_action in game_memory:
                output = [0, 0, 0]
                output[taken_action] = 1
                training_data.append([seen_observation, output])

        # Start the next game from a fresh state (also leaves the environment
        # reset for whoever uses it after this function returns).
        observation = env.reset()

    print(accepted_scores)

    return training_data


def build_model(input_size, output_size):
    """Create and compile the feed-forward network used for action prediction.

    Args:
        input_size: Number of input features of the first layer.
        output_size: Number of units of the final, linearly activated layer.

    Returns:
        A compiled ``Sequential`` model (MSE loss, default ``Adam`` optimizer)
        with hidden layers of 128 and 52 ReLU units.
    """
    layers = [
        Dense(128, input_dim=input_size, activation='relu'),
        Dense(52, activation='relu'),
        Dense(output_size, activation='linear'),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer=Adam())
    return network

def train_model(training_data):
    """Fit a fresh network on collected (observation, one-hot action) pairs.

    Args:
        training_data: Sequence of ``[observation, one_hot_action]`` pairs.
            All observations must share one length, as must all targets.

    Returns:
        The model returned by ``build_model`` after 10 epochs of fitting.

    Raises:
        ValueError: If ``training_data`` is empty, i.e. there is nothing to
            infer the input/output sizes from.
    """
    if len(training_data) == 0:
        raise ValueError(
            "training_data is empty: no game reached the score requirement, "
            "so there is nothing to train on"
        )

    first_observation, first_target = training_data[0]
    X = np.array([sample[0] for sample in training_data]).reshape(-1, len(first_observation))
    y = np.array([sample[1] for sample in training_data]).reshape(-1, len(first_target))

    # Layer sizes follow the data: one input per feature, one output per action.
    model = build_model(input_size=X.shape[1], output_size=y.shape[1])

    model.fit(X, y, epochs=10)
    return model

In [None]:
# Collect (observation, one-hot action) pairs from random play, then fit a
# network on them. Both steps operate on the module-level `env`.
training_data = model_data_preparation()
trained_model = train_model(training_data)

In [None]:
# Evaluate the trained network over 10 rendered games: at every step the
# action with the highest predicted output is played.
scores = []
choices = []
try:
    for each_game in range(10):
        score = 0
        # Start each game from a fresh reset and use its observation right
        # away, so no action has to be guessed at random (the previous
        # fallback, randrange(0, 2), could never pick action 2).
        prev_obs = env.reset()
        for step_index in range(goal_steps):
            env.render()
            action = np.argmax(trained_model.predict(prev_obs.reshape(1, -1))[0])

            choices.append(action)
            prev_obs, reward, done, info = env.step(action)
            score += reward
            if done:
                break

        scores.append(score)
finally:
    # Release the render window even if a game fails part-way through.
    env.close()

print(scores)
print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{} choice 2:{}'.format(choices.count(1)/len(choices),
                                                    choices.count(0)/len(choices),
                                                    choices.count(2)/len(choices)))

---

# DQN

In [None]:
import random

from collections import deque

import gym
import numpy as np

from tqdm.autonotebook import tqdm

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [None]:
class Success:
    """Running tally of outcomes plus a sliding window over the latest ten.

    Attributes are kept in sync on every addition:
      * ``sum``       - total of all values ever added,
      * ``last10``    - list with the (at most) ten most recent values,
      * ``last10sum`` - sum of ``last10``,
      * ``threshold`` - window sum at which the object becomes truthy.
    """

    def __init__(self, threshold=10):
        self.sum = 0
        self.last10 = []
        self.last10sum = 0
        self.threshold = threshold

    def __iadd__(self, value):
        """Record ``value`` in place (``tally += value``) and return ``self``."""
        self.sum += value
        # Build a new list rather than appending in place, so a list that is
        # still referenced elsewhere is never modified behind its owner's back.
        self.last10 = [*self.last10, value][-10:]
        self.last10sum = sum(self.last10)
        return self

    def __add__(self, value):
        """Return a new tally with ``value`` recorded; ``self`` is left untouched."""
        new = Success(threshold=self.threshold)
        new.sum = self.sum
        # Copy the window: sharing the list would let the new object's
        # bookkeeping leak into the original.
        new.last10 = list(self.last10)
        new.last10sum = self.last10sum
        return new.__iadd__(value)

    def __bool__(self):
        """True once the sum over the recent window reaches ``threshold``."""
        return sum(self.last10) >= self.threshold

In [None]:
class DQN:
    """Deep Q-Network agent with a replay memory and a target network.

    The agent acts epsilon-greedily on ``env``, stores every transition in a
    bounded replay memory, and after each step fits the online network on a
    random mini-batch whose targets are bootstrapped from the target network.
    The target network is synchronised and epsilon is decayed once per episode.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()``, ``step(action)`` (returning
            ``(observation, reward, done, info)``) and ``render()``.
    """

    def __init__(self, env):
        self.env = env
        self.gamma = 0.99  # discount applied to bootstrapped future values

        # Exploration schedule: epsilon is multiplied by `epsilon_decay` after
        # every episode and never drops below `epsilon_min`.
        self.epsilon = 1
        self.epsilon_decay = 0.95  # 0.05
        self.epsilon_min = 0.01

        # NOTE: attribute name keeps its original spelling ("learing") so that
        # existing code reading it keeps working.
        self.learing_rate = 0.001

        self.memory = deque(maxlen=20000)
        self.num_episodes = 400
        self.max_iter = 201  # max is 200
        self.batch_size = 32

        self.model = self.build_model()
        self.target_model = self.build_model()

        self.sync_models()

    def build_model(self):
        """Build and compile a network mapping a state to one value per action."""
        model = Sequential()
        state_shape = self.env.observation_space.shape
        action_shape = self.env.action_space.n

        model.add(Dense(24, activation='relu', input_shape=state_shape))
        model.add(Dense(48, activation='relu'))
        model.add(Dense(action_shape, activation='linear'))

        optimizer = Adam(learning_rate=self.learing_rate)
        model.compile(loss='mse', optimizer=optimizer)
        return model

    def sync_models(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        """Pick an action for ``state`` epsilon-greedily.

        Args:
            state: Batch containing one state, as accepted by
                ``self.model.predict``.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted value.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, self.env.action_space.n)
        return np.argmax(self.model.predict(state, verbose=0)[0])

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, clamped at ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay memory."""
        self.memory.append([state, action, reward, new_state, done])

    def generate_batch(self):
        """Sample ``batch_size`` transitions and return them column-wise.

        Returns:
            Tuple ``(states, actions, rewards, new_states, notdones)``:
            ``states``/``new_states`` are the stored state rows concatenated
            along axis 0, ``actions`` is an int vector, ``rewards`` a float
            vector and ``notdones`` a float vector holding ``0.0`` for
            terminal transitions and ``1.0`` otherwise.
        """
        batch = random.sample(self.memory, self.batch_size)

        # Unzip into columns instead of packing the mixed-shape rows into one
        # NumPy array: ragged array creation is rejected by recent NumPy.
        states, actions, rewards, new_states, dones = zip(*batch)

        states = np.concatenate(states, axis=0)  # [batch_size x state_dim]
        new_states = np.concatenate(new_states, axis=0)  # [batch_size x state_dim]
        rewards = np.array(rewards, dtype=float)  # [batch_size]
        actions = np.array(actions, dtype=int)  # [batch_size]
        dones = np.array(dones, dtype=bool)  # [batch_size]
        notdones = (~dones).astype(float)

        return states, actions, rewards, new_states, notdones

    def replay(self):
        """Run one training step on a random mini-batch from the memory.

        Does nothing until the memory holds at least ``batch_size`` entries.
        """
        if len(self.memory) < self.batch_size:
            return

        states, actions, rewards, new_states, notdones = self.generate_batch()

        # Start from the current predictions so only the taken action's entry
        # contributes to the loss.
        targets = self.model.predict(states, verbose=0)
        indices = np.arange(self.batch_size)
        Q_futures = self.target_model.predict(new_states, verbose=0).max(axis=1)
        # Terminal transitions (notdones == 0) get no bootstrapped future value.
        targets[(indices, actions)] = rewards + Q_futures * self.gamma * notdones

        self.model.fit(states, targets, epochs=1, verbose=0)

    def optimize_model(self, state, eps, render=True):
        """Play and learn from one episode.

        Args:
            state: Initial state with a leading batch axis of size 1.
            eps: Episode index; rendering happens only when it is a multiple
                of 50.
            render: Whether rendering is allowed at all.

        Returns:
            bool: ``True`` when the episode ended before step index 199.
            Presumably this means the goal was reached before the
            environment's step limit - confirm against the environment used.
        """
        for i in range(self.max_iter):
            action = self.act(state)

            # Show the animation every 50 eps
            if render and eps % 50 == 0:
                self.env.render()

            new_state, reward, done, _ = self.env.step(action)
            new_state = new_state.reshape(1, -1)

            # Adjust reward for task completion
            position = new_state[0][0]
            if position >= 0.5:
                reward += 10

            self.remember(state, action, reward, new_state, done)
            self.replay()

            state = new_state

            if done:
                break

        self.sync_models()
        self.decay_epsilon()

        return i < 199

    def fit(self, render=True):
        """Train for up to ``num_episodes`` episodes with early stopping.

        Training stops as soon as the ``Success`` tally over recent episodes
        becomes truthy.

        Args:
            render: Forwarded to ``optimize_model``.

        Returns:
            DQN: ``self``, to allow ``DQN(env).fit()`` chaining.
        """
        success = Success()

        episodes = tqdm(range(self.num_episodes))
        for eps in episodes:
            state = self.env.reset().reshape(1, -1)
            success += self.optimize_model(state, eps, render)

            episodes.set_postfix_str(f'overall: {success.sum}, '
                                     f'last10: {success.last10sum}')
            if success:
                print(f'10 success in a row, stopping early at episode {eps}.')
                episodes.close()
                break

        return self

                
def play(env, model, n=1):
    """Render ``n`` episodes played by ``model`` and then close ``env``.

    Args:
        env: Environment with ``reset()``, ``render()``, ``step(action)``
            (returning ``(observation, reward, done, info)``) and ``close()``.
        model: Object whose ``act(state)`` returns the action for a state
            that carries a leading batch axis of size 1.
        n: Number of episodes to play.
    """
    try:
        for _ in range(n):
            done = False
            state = env.reset().reshape(1, -1)
            while not done:
                env.render()
                action = model.act(state)
                new_state, reward, done, info = env.step(action)
                state = new_state.reshape(1, -1)
    finally:
        # Close the environment even when an episode fails part-way through.
        env.close()

In [None]:
# Fresh environment for the DQN experiment.
env = gym.make('MountainCar-v0')

# Seed the environment and the Python/NumPy generators before the agent is
# constructed, so everything drawn from them afterwards is repeatable.
# NOTE(review): no seed is set for the deep-learning backend here - confirm
# whether network initialisation needs one for full reproducibility.
env.seed(42)
random.seed(42)
np.random.seed(42)

# Build the agent and train it with rendering disabled; fit() returns the agent.
dqn = DQN(env=env).fit(render=False)

env.close()

In [None]:
# Watch the trained agent for 10 rendered episodes; play() closes `env` when done.
play(env, dqn, 10)