Source: [moskomule/pytorch.rl.learning](https://github.com/moskomule/pytorch.rl.learning) <br>
Another reference: [MCqlearn.py in vikasjiitk/Deep-RL-Mountain-Car](https://github.com/vikasjiitk/Deep-RL-Mountain-Car/blob/master/MCqlearn.py)

In [3]:
%reload_ext autoreload
%autoreload 2
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv
import torch
import random
import numpy as np
import math
from EXITrl.table_base import TableBase
from EXITrl.approximation_base import ApproximationBase, ExperienceReplay
# Notebook-level gridworld instance (default constructor arguments); the
# tabular agents in the following cells are constructed with this `env`.
env = GridworldEnv()

### Sarsa (Table)

In [132]:
class Sarsa(TableBase):
    """Tabular Sarsa: on-policy one-step TD control over the table ``self.Q``.

    ``self.Q``, ``self.env``, ``self.gamma``, ``self.alpha`` and
    ``self.epsilon_greedy`` are not defined here; they are expected to be
    provided by ``TableBase``.
    """

    def __init__(self, env, num_episodes, epsilon=0.1, alpha=0.5, gamma=.9):
        super().__init__(env, num_episodes, epsilon, alpha, gamma)

    def _loop(self, episode) -> int:
        """Run a single episode, updating ``self.Q`` after every step.

        Args:
            episode: Index of the episode being run (not used by this method).

        Returns:
            The sum of the rewards collected during the episode.
        """
        choose = self.epsilon_greedy
        state = self.env.reset()
        action = choose(state)
        episode_return = 0
        while True:
            next_state, reward, done, _ = self.env.step(action)
            # The follow-up action is sampled before the update because Sarsa
            # bootstraps from the action that will actually be taken next.
            next_action = choose(next_state)
            ########## CORE Algorithm #########
            # On the terminal transition there is nothing to bootstrap from,
            # so the target is the bare reward.
            td_target = (
                reward
                if done
                else reward + self.gamma * self.Q[next_state, next_action]
            )
            self.Q[state, action] += self.alpha * (td_target - self.Q[state, action])
            ###################################
            episode_return += reward
            if done:
                return episode_return
            state, action = next_state, next_action
# Build a tabular Sarsa agent on `env` with num_episodes=50 and train it.
s = Sarsa(env, 50)
s.train()
# Last expression of the cell: its value is displayed as the cell output.
s.convert_Q_to_V()

array([[ 0,  0, -1, -2],
       [ 0, -1, -2, -1],
       [-1, -1, -1,  0],
       [-2, -1,  0,  0]])

### Sarsa lambda (Table)

In [129]:
class SarsaLambda(TableBase):
    """Tabular Sarsa(lambda) with accumulating eligibility traces.

    ``self.Z`` holds one trace per (state, action) entry and has the same
    layout as ``self.Q`` (it is created as a clone of it). ``self.Q``,
    ``self.env``, ``self.gamma``, ``self.alpha``, ``self.lambd`` and
    ``self.epsilon_greedy`` are expected to be provided by ``TableBase``.
    """

    def __init__(self, env, num_episodes, epsilon=0.1, alpha=0.5, gamma=.9, lambd=0.1):
        super().__init__(env, num_episodes, epsilon, alpha, gamma, lambd)
        # Cloned only to get a table of matching layout; the values are
        # zeroed at the start of every episode in `_loop`.
        self.Z = self.Q.clone()

    def _loop(self, episode) -> int:
        """Run a single episode, updating every entry of ``self.Q`` each step.

        Args:
            episode: Index of the episode being run (not used by this method).

        Returns:
            The sum of the rewards collected during the episode.
        """
        choose = self.epsilon_greedy
        state = self.env.reset()
        action = choose(state)
        # Traces do not carry over between episodes.
        self.Z.zero_()
        trace_decay = self.gamma * self.lambd
        episode_return = 0
        while True:
            next_state, reward, done, _ = self.env.step(action)
            next_action = choose(next_state)
            ########## CORE Algorithm #########
            # No bootstrapping on the terminal transition.
            td_target = (
                reward
                if done
                else reward + self.gamma * self.Q[next_state, next_action]
            )
            td_error = td_target - self.Q[state, action]
            # Accumulating trace: bump the visited pair, then spread the TD
            # error over the whole table in proportion to each trace.
            self.Z[state, action] += 1
            self.Q += self.alpha * td_error * self.Z
            self.Z = trace_decay * self.Z
            ###################################
            episode_return += reward
            if done:
                return episode_return
            state, action = next_state, next_action
# Build a tabular Sarsa(lambda) agent on `env` with num_episodes=510 and train it.
# NOTE(review): 510 differs from the 50 episodes used for plain Sarsa — confirm it is intentional.
s = SarsaLambda(env, 510)
s.train()
# Last expression of the cell: its value is displayed as the cell output.
s.convert_Q_to_V()

array([[ 0, -1, -2, -2],
       [-1, -1, -2, -1],
       [-2, -2, -1, -1],
       [-2, -1, -1,  0]])

### Sarsa Approximation (Grid World)

In [7]:
class GridworldEnv2DState(GridworldEnv):
    """Gridworld that reports observations as ``[row, col]`` arrays.

    The parent environment identifies a cell by a single flat index; this
    wrapper converts that index into a two-element integer array in both
    ``reset`` and ``step`` so it can be fed to a function approximator.
    """

    def __init__(self, shape=None):
        """Create the environment.

        Args:
            shape: Two-element grid shape ``[rows, cols]``. Defaults to a
                fresh ``[4, 4]`` list (a new list per instance, so the
                default is never shared between environments).
        """
        super().__init__([4, 4] if shape is None else shape)

    def convert_to_2_dimension_state(self, state):
        """Convert a flat, row-major cell index into ``[row, col]``.

        Args:
            state: Flat integer index of a grid cell.

        Returns:
            ``np.ndarray`` of two ints: ``[state // cols, state % cols]``.
        """
        # Use the actual grid width rather than a hard-coded 4 so that
        # non-default shapes are converted correctly.
        num_cols = self.shape[1]
        row, col = divmod(state, num_cols)
        return np.array([row, col], dtype=int)

    def reset(self):
        """Reset the parent environment and return the start cell as ``[row, col]``."""
        # Zero-argument super() starts the lookup at GridworldEnv itself, so
        # an override defined there is not skipped.
        return self.convert_to_2_dimension_state(super().reset())

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        ``state`` is the ``[row, col]`` form of the parent's flat next state;
        the other three values are passed through unchanged.
        """
        state, reward, done, info = super().step(action)
        return self.convert_to_2_dimension_state(state), reward, done, info


# Rebind the notebook-level `env` to the 2-D-state variant.
env = GridworldEnv2DState()

In [5]:
class SarsaApproximation(ApproximationBase):
    """Sarsa with a learned Q-function instead of a table.

    ``self.env``, ``self.gamma``, ``self.epsilon_greedy``,
    ``self.approximate_q`` and ``self.update_weight`` are not defined here;
    they are expected to be provided by ``ApproximationBase``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        num_state: Forwarded to ``ApproximationBase``.
        num_action: Forwarded to ``ApproximationBase``.
        num_episodes: Forwarded to ``ApproximationBase``.
        num_experience: Size passed to ``ExperienceReplay``. The special
            value ``1`` disables replay and updates on each transition
            directly.
        epsilon: Forwarded to ``ApproximationBase``.
        alpha: Forwarded to ``ApproximationBase``.
        gamma: Discount factor used when building the TD target.
    """

    def __init__(self, 
                 env, 
                 num_state, 
                 num_action, 
                 num_episodes, 
                 num_experience=100, 
                 epsilon=0.01, 
                 alpha=0.008, 
                 gamma=.9):
        super().__init__(env, 
                         num_state, 
                         num_action, 
                         num_episodes, 
                         epsilon, 
                         alpha, 
                         gamma)
        # Pick the update strategy once so `_loop` needs no branching.
        if num_experience == 1:
            self.update_experience = self.update_step_by_step_experience
        else:
            self.experience_replay = ExperienceReplay(num_experience)
            self.update_experience = self.update_experience_replay

    def _td_pair(self, state, action, reward, state_, action_, done):
        """Return ``(td_target, predict_q)`` for one transition.

        The target is the bare reward on a terminal transition, otherwise
        ``reward + gamma * Q(state_, action_)``. The prediction is
        ``Q(state, action)``.
        """
        if done:
            td_target = torch.Tensor(np.array(reward))
        else:
            # NOTE(review): the bootstrap term is not detached here; confirm
            # that update_weight treats the target as a constant.
            td_target = reward + self.gamma * self.approximate_q(state_)[action_]
        predict_q = self.approximate_q(state)[action]
        return td_target, predict_q

    def update_step_by_step_experience(self, state, action, reward, state_, action_, done):
        """Update the weights from this single transition only."""
        td_target, predict_q = self._td_pair(state, action, reward, state_, action_, done)
        self.update_weight(td_target, predict_q)

    def update_experience_replay(self, state, action, reward, state_, action_, done):
        """Store the transition, then update the weights from a replay batch.

        ``_td_pair`` is handed to ``ExperienceReplay.get_batch`` as the
        callback that turns a stored transition into a (target, prediction)
        pair.
        """
        self.experience_replay.remember(state, action, reward, state_, action_, done)
        targets, predict_qs = self.experience_replay.get_batch(self._td_pair)
        self.update_weight(targets, predict_qs)

    def _loop(self, episode) -> int:
        """Run a single episode and return the sum of its rewards.

        Args:
            episode: Index of the episode being run (not used by this method).
        """
        policy = self.epsilon_greedy
        done = False
        total_reward = 0
        state = self.env.reset()
        action = policy(state)
        while not done:
            state_, reward, done, _ = self.env.step(action)
            # Sarsa is on-policy: the next action is chosen before updating.
            action_ = policy(state_)
            self.update_experience(state, action, reward, state_, action_, done)
            total_reward += reward
            state = state_
            action = action_
        return total_reward

    def convert_Q_to_V(self):
        """Return the greedy state values as an array shaped like the grid.

        For every flat state index the Q-values are printed and their
        maximum is stored. Requires an environment with
        ``observation_space.n``, ``shape`` and
        ``convert_to_2_dimension_state``.
        """
        num_states = self.env.observation_space.n
        V = np.zeros(num_states)
        for state in range(num_states):
            # Use the agent's own environment, not the notebook-level `env`,
            # which may have been rebound to a different environment.
            convert_state = self.env.convert_to_2_dimension_state(state)
            q_values = self.approximate_q(convert_state)
            print(convert_state, q_values.detach().numpy())
            V[state] = q_values.max().item()
        return V.reshape(self.env.shape)
        
# Train the function-approximation agent on the notebook-level `env`.
# num_experience is left at its default (100), so the experience-replay
# update path is selected.
# NOTE(review): num_state=2 presumably matches the [row, col] observation of
# GridworldEnv2DState — confirm `env` is that variant when this cell runs.
s = SarsaApproximation(env, 
                       num_state=2, 
                       num_action=env.action_space.n, 
                       num_episodes=50,
                       epsilon=0.01, 
                       alpha=0.008, 
                       gamma=.9)
# NOTE(review): the positional True presumably turns on per-episode logging — confirm against ApproximationBase.train.
s.train(True)
# Last expression of the cell: its value is displayed as the cell output.
s.convert_Q_to_V()

TypeError: expected np.ndarray (got numpy.int64)

### Test the neural network against Q values from the table-based agent

In [8]:
# Reference action values used as regression targets in the loop below:
# 16 rows (one per flat grid-cell index) by 4 columns (one per action).
# NOTE(review): presumably copied from a trained tabular agent's Q table — confirm the source.
Q = np.array([[ 0.0000,  0.0000,  0.0000,  0.0000],
            [-1.6439, -1.4790, -1.3537, -0.9999],
            [-2.1910, -2.2997, -2.0220, -1.9134],
            [-2.8211, -2.6443, -2.3472, -2.4686],
            [-0.9999, -1.0780, -1.4980, -1.6079],
            [-1.8345, -1.8612, -1.7473, -1.6657],
            [-1.8404, -2.1300, -2.0000, -2.1527],
            [-2.0695, -2.3770, -1.8677, -2.0256],
            [-1.8623, -2.0250, -2.4303, -2.0676],
            [-2.1694, -1.8296, -2.3622, -1.9963],
            [-1.9552, -1.6668, -1.4604, -1.7984],
            [-1.0469, -1.5610, -0.9980, -1.0685],
            [-2.4361, -2.4313, -2.4637, -2.7323],
            [-1.9719, -1.8673, -2.0908, -2.6633],
            [-1.4525, -0.9980, -1.5573, -1.9203],
            [ 0.0000,  0.0000,  0.0000,  0.0000]])
# Supervised sanity check of the approximator: no environment interaction,
# the network is regressed directly onto entries of the reference table `Q`.
env = GridworldEnv2DState()
s = SarsaApproximation(
    env,
    num_state=2,
    num_action=env.action_space.n,
    num_episodes=50,
    epsilon=0.01,
    alpha=0.008,
    gamma=.9,
)
num_rows = Q.shape[0]
for _ in range(500):
    # Draw the state index first and the action second so the random
    # stream is consumed in a fixed order.
    idx = np.random.randint(num_rows)
    action = np.random.randint(4)
    # Flat index -> [row, col] on the 4-wide grid.
    state = np.array(divmod(idx, 4), dtype=int)
    td_target = Q[idx, action]
    predict_q = s.approximate_q(state)[action]
    s.update_weight(td_target, predict_q)
# Last expression of the cell: its value is displayed as the cell output.
s.convert_Q_to_V()

[0 0] [-0.39613077 -0.03714038  0.13694015 -0.3392189 ]
[0 1] [-1.6346174 -1.6044319 -1.0119783 -1.3353143]
[0 2] [-2.2975006 -2.5693913 -1.4769734 -2.0587354]
[0 3] [-2.7724636 -3.3577216 -1.7400581 -2.6508527]
[1 0] [-1.3514197 -1.4272399 -1.3393623 -1.5845333]
[1 1] [-1.5627629 -1.4453161 -1.2919213 -1.5040332]
[1 2] [-1.8155991 -1.8279312 -1.2653239 -1.7701101]
[1 3] [-2.1938133 -2.4766972 -1.4244022 -2.2850974]
[2 0] [-2.0016148 -2.2187886 -2.046148  -2.2909245]
[2 1] [-1.6934061 -1.5435145 -1.548365  -1.8205878]
[2 2] [-1.5272017 -1.1294053 -1.0528605 -1.4390197]
[2 3] [-1.6833905 -1.4549936 -0.9649601 -1.63746  ]
[3 0] [-2.3912418 -2.6372817 -2.4151344 -2.8316402]
[3 1] [-2.0485532 -1.9206715 -1.946922  -2.3814125]
[3 2] [-1.6947435 -1.2301654 -1.3014444 -1.7872014]
[3 3] [-1.4059881 -0.7419831 -0.7568294 -1.3104312]


array([[ 0.13694015, -1.01197827, -1.47697341, -1.74005806],
       [-1.33936226, -1.29192126, -1.26532388, -1.42440224],
       [-2.00161481, -1.54351449, -1.0528605 , -0.9649601 ],
       [-2.39124179, -1.92067146, -1.23016536, -0.74198312]])

[CartPole wiki](https://github.com/openai/gym/wiki/CartPole-v0)

### Sarsa Approximation (CartPole)

In [14]:
import gym
# Rebind the notebook-level `env` to CartPole-v1 and train the same agent
# class on it.
env = gym.make('CartPole-v1')
# num_state is the length of the observation vector; num_experience=128 is
# not 1, so the experience-replay update path is selected.
s = SarsaApproximation(env, 
                       num_state=env.observation_space.shape[0],
                       num_action=env.action_space.n, 
                       num_episodes=50,
                       num_experience=128,
                       epsilon=0.01, 
                       alpha=0.007, 
                       gamma=.99)
# NOTE(review): the positional True presumably turns on the per-episode reward log seen in the output — confirm against ApproximationBase.train.
s.train(True)


  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode: 0 reward: 12.0
episode: 1 reward: 22.0
episode: 2 reward: 21.0
episode: 3 reward: 10.0
episode: 4 reward: 17.0
episode: 5 reward: 34.0
episode: 6 reward: 66.0
episode: 7 reward: 67.0
episode: 8 reward: 19.0
episode: 9 reward: 47.0
episode: 10 reward: 65.0
episode: 11 reward: 9.0
episode: 12 reward: 23.0
episode: 13 reward: 11.0
episode: 14 reward: 16.0
episode: 15 reward: 40.0
episode: 16 reward: 42.0
episode: 17 reward: 42.0
episode: 18 reward: 26.0
episode: 19 reward: 59.0
episode: 20 reward: 96.0
episode: 21 reward: 28.0
episode: 22 reward: 26.0
episode: 23 reward: 37.0
episode: 24 reward: 32.0
episode: 25 reward: 43.0
episode: 26 reward: 52.0
episode: 27 reward: 104.0
episode: 28 reward: 158.0
episode: 29 reward: 149.0
episode: 30 reward: 164.0
episode: 31 reward: 135.0
episode: 32 reward: 99.0
episode: 33 reward: 229.0
episode: 34 reward: 14.0
episode: 35 reward: 16