In [3]:
from random import random
import gymnasium as gym
import numpy as np
import time
import math
import itertools

In [2]:
env=gym.make('MountainCar-v0', render_mode="human")
env.reset()
gamma = 0.99
n_eps=5
policy = lambda obs: 2 if obs[1] > 0 else 0
env.close()

In [None]:
class MC:
    def __init__(self, bins=(15,12), num_episodes=20000, min_lr=0.01, min_explore=0.01, discount=0.9, decay=15):
        self.bins = bins
        self.num_episodes = num_episodes
        self.min_lr = min_lr
        self.min_explore = min_explore
        self.discount = discount
        self.decay = decay
        self.env=gym.make('MountainCar-v0')
        self.upper_bounds = [self.env.observation_space.high[0], self.env.observation_space.high[1]]
        self.lower_bounds = [self.env.observation_space.low[0], self.env.observation_space.low[1]]
        self.Q_table = np.zeros(self.bins + (self.env.action_space.n,))
        m = self.discretize_state(np.array([-0.5,0.02]))
        print(m)
        
    def discretize_state(self, obs):
        discretized = list()
        for i in range(len(obs)):
            scaling = (obs[i] + abs(self.lower_bounds[i])) / (self.upper_bounds[i] - self.lower_bounds[i])
            new_obs = int(np.round((self.bins[i] - 1) * scaling))
            new_obs = min(self.bins[i] - 1, max(0, new_obs))
            discretized.append(new_obs)
        return tuple(discretized)
    
    def choose_action(self, state, flag=True):
        if flag:
            x = (np.random.uniform(0, 1))
            if  x < self.explore_rate:
                return self.env.action_space.sample()
            else:
                return np.argmax(self.Q_table[state])
        else:
            return np.argmax(self.Q_table[state])
    
    def get_explore_rate(self, t):
        return max(self.min_explore, min(1., 1. - math.log10((t + 1) / self.decay)))

    def get_lr(self, t):
        return max(self.min_lr, min(1., 1. - math.log10((t + 1) / self.decay)))
    
    def update_q(self,state, G, action, k):
        # +=learning_rate * (G - Q_table[state[0], state[1], action])
        q = self.Q_table[state][action]
        q+= self.lr * (1/k) *(G - q)
        self.Q_table[state][action] = q
        
    def train(self):
        episodes = []
        for i in range(self.num_episodes):
            print(f'episode {i+1}')
            episode = []
            k=1
            current_state = self.discretize_state(self.env.reset(seed=42)[0])
#             self.env.render()
            self.lr = self.get_lr(k)
            self.explore_rate = self.get_explore_rate(k)
            termintated, truncated = False,False
            while not any([termintated, truncated]):
                action = self.choose_action(current_state)
#                 print(current_state, action)
                obs, reward, termintated, truncated, _ = self.env.step(action)
#                 self.env.render()
                current_state = self.discretize_state(obs)
                episode.append([current_state,reward, action])
                k+=1
            G=0
            for i, info in enumerate(episode):
                i+=1
                G += (self.discount**i * info[1])
                self.update_q(info[0], G, info[2], i)
            episodes.append(episode)
        self.env.close()
    def run(self):
        self.env = gym.make('MountainCar-v0', render_mode='human')
        current_state = self.discretize_state(self.env.reset(seed=42)[0])
        termintated, truncated = False,False
        while not any([termintated, truncated]):
            action = self.choose_action(current_state, flag=False)
#                 print(current_state, action)
            obs, reward, termintated, truncated, _ = self.env.step(action)
            self.env.render()
            current_state = self.discretize_state(obs)
#                 episode.append([current_state,reward, action])
        self.env.close()
        
        
            

        
rl = MC()
rl.train()
rl.run()

(5, 7)
episode 1
episode 2
episode 3
episode 4
episode 5
episode 6
episode 7
episode 8
episode 9
episode 10
episode 11
episode 12
episode 13
episode 14
episode 15
episode 16
episode 17
episode 18
episode 19
episode 20
episode 21
episode 22
episode 23
episode 24
episode 25
episode 26
episode 27
episode 28
episode 29
episode 30
episode 31
episode 32
episode 33
episode 34
episode 35
episode 36
episode 37
episode 38
episode 39
episode 40
episode 41
episode 42
episode 43
episode 44
episode 45
episode 46
episode 47
episode 48
episode 49
episode 50
episode 51
episode 52
episode 53
episode 54
episode 55
episode 56
episode 57
episode 58
episode 59
episode 60
episode 61
episode 62
episode 63
episode 64
episode 65
episode 66
episode 67
episode 68
episode 69
episode 70
episode 71
episode 72
episode 73
episode 74
episode 75
episode 76
episode 77
episode 78
episode 79
episode 80
episode 81
episode 82
episode 83
episode 84
episode 85
episode 86
episode 87
episode 88
episode 89
episode 90
episode 91
e

In [7]:
rl.run()