In [96]:
import gymnasium as gym
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import sklearn

In [97]:
env = gym.make("MountainCar-v0", render_mode="human")
observation, info = env.reset(seed=42)
for _ in range((100)):
   action = env.action_space.sample()  # this is where you would insert your policy
   observation, reward, terminated, truncated, info = env.step(action)
   if terminated or truncated:
      observation, info = env.reset()

env.close()

In [98]:
import numpy as np

class TileCoder:
    def __init__(self, num_tilings, num_tiles, bounds, action_space):
        self.num_tilings = num_tilings
        self.num_tiles = num_tiles
        self.bounds = bounds
        self.action_space = action_space

        self.position_tile_size = (bounds[0][1] - bounds[0][0]) / num_tiles
        self.velocity_tile_size = (bounds[1][1] - bounds[1][0]) / num_tiles

        self.position_tiling_offset = np.linspace(0, self.position_tile_size, num_tilings, endpoint=False)
        self.velocity_tiling_offset = np.linspace(0, self.velocity_tile_size, num_tilings, endpoint=False)

    def encode(self, state, action):
        position, velocity = state
        position_indices = []
        velocity_indices = []

        for i in range(self.num_tilings):
            position_tile_index = int((position - self.bounds[0][0] + self.position_tiling_offset[i]) / self.position_tile_size)
            velocity_tile_index = int((velocity - self.bounds[1][0] + self.velocity_tiling_offset[i]) / self.velocity_tile_size)

            position_indices.append(position_tile_index)
            velocity_indices.append(velocity_tile_index)

        # Include the action index
        action_index = action
        return tuple(position_indices + velocity_indices)

# Example usage:
tile_coder = TileCoder(num_tilings=8, num_tiles=8, bounds=[(-1.2, 0.6), (-0.07, 0.07)], action_space=3)
state = [-0.50991637, 0.00546296]
action = 2
encoded_state = tile_coder.encode(state, action)
print(encoded_state)


(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5)


In [99]:
import numpy as np

class MountainCarDiscretizer:
    def __init__(self, num_bins=20):
        self.num_bins = num_bins
        self.position_bins = np.linspace(-1.2, 0.6, num=num_bins - 1)
        self.velocity_bins = np.linspace(-0.07, 0.07, num=num_bins - 1)

    def discretize(self, state):
        position, velocity = state
        position_bin = np.digitize(position, self.position_bins)
        velocity_bin = np.digitize(velocity, self.velocity_bins)
        return np.array([position_bin, velocity_bin])

# Example usage:
discretizer = MountainCarDiscretizer(num_bins=50)

# Example state
state = [-0.5, 0.02]

# Discretize the state
discretized_state = discretizer.discretize(state)

print("Original state:", state)
print("Discretized state:", discretized_state)


Original state: [-0.5, 0.02]
Discretized state: [19 31]


In [100]:
tile_coder.encode([0.2, 0.07], 2)

(6, 6, 6, 6, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8)

In [101]:
def Q(s,a,W):
    
    return W[a].T@s 

def pi(s,W,epsilon=0.2):
    
    if np.random.rand() < epsilon:
        # Explore: choose a random action
        return np.random.choice(3)
    else:
        return np.argmax([Q(s,a,W) for a in range(3)])
 
def feature(observation):
    
    return np.array(tile_coder.encode(observation, 2))

def feature_discret(observation):
    return discretizer.discretize(observation)

In [102]:
env = gym.make("MountainCar-v0")
observation, info = env.reset(seed=42)
s=observation
a=action
W=np.zeros((3,2))
alpha=0.001
eps=0.2
gamma=1

In [103]:
observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)
s=scaler.transform((s.reshape(1, 2)))



In [104]:

for _ in range(5000):
    terminated=False
    while not (terminated):
#         s=scaler.transform((s.reshape(1, 2)))
        x=feature_discret(s.reshape(2,))
#         x=feature(s)
        a = pi(x,W,epsilon=0.2)  # this is where you would insert your policy
        sp, R, terminated, truncated, info = env.step(a) 
#         x=x.reshape(3,1)
        if terminated or truncated:
            W[a]+=(alpha*(R-Q(x,a,W))*x).astype(float)
            break
        #goto next episode
#         sp=scaler.transform((sp.reshape(1, 2)))
        xp=feature_discret(sp.reshape(2,))
#         xp=feature(sp)
        ap=pi(xp,W,epsilon=0.2)
        W[a]+=alpha*(R+(gamma*Q(xp,ap,W))-Q(x,a,W))*x
       
        
        s=sp
        a=ap
   

env.close()

In [105]:
W

array([[ 0.01226655, -0.04932258],
       [ 0.01251863, -0.04951416],
       [ 0.01083569, -0.04823512]])

In [108]:
env = gym.make("MountainCar-v0", render_mode="human")
observation, info = env.reset(seed=42)
for _ in range(500):
   x=feature_discret(observation) 
   action =  pi(x,W,epsilon=0)  # this is where you would insert your policy
   observation, reward, terminated, truncated, info = env.step(action)
   if terminated :
      observation, info = env.reset()

env.close()