In [None]:
! py -m pip install --upgrade pip
! pip install wheel setuptools pip --upgrade
! pip install swig
! pip install gymnasium[box2d]
! pip install moviepy
! pip install numpy
! pip install -U matplotlib

# Remarks

If the video does not work, it is advisable to restart the entire kernel after the initial installation.

In [None]:
import gymnasium
from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder

# Record 200 steps of a random policy to check that the environment and the
# video pipeline work.
env = gymnasium.make(
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

observation, info = env.reset()
videoPath = "video.mp4"
video = VideoRecorder(env, videoPath)

try:
    for _ in range(200):
        # capture_frame() renders the environment itself, so no separate
        # env.render() call is needed.
        video.capture_frame()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        print(observation, reward, terminated, truncated, info)

        # Stepping a finished episode is undefined; start a new one instead.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Always finalize the video file and release the environment, even if a
    # step raises.
    video.close()
    env.close()

# Training the Lunar Lander

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import itertools

In [None]:
# Environment used for training: discrete actions, wind disabled, frames
# rendered as RGB arrays.
env = gym.make(
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

# Per-component lower and upper limits of the eight observation values,
# paired up as (low, high) tuples for clipping and discretization.
observationSpaceLow = (-1.5, -1.5, -5., -5., -3.1415927, -5., -0., -0.)
observationSpaceHigh = (1.5, 1.5, 5., 5., 3.1415927, 5., 1., 1.)
observationSpace = [
    (low, high)
    for low, high in zip(observationSpaceLow, observationSpaceHigh)
]

# Create bins/intervals for observations
def makeIntervals(observationSpace, intervals=10):
    """Build the bin edges used to discretize each observation component.

    Args:
        observationSpace: Sequence of (low, high) pairs, one per observation
            component. The last two entries are treated as two-valued flags;
            their bounds are not used.
        intervals: Number of equal-width bins per non-flag component (>= 1).

    Returns:
        A list with one list of edges per component. Every non-flag component
        gets exactly ``intervals + 1`` increasing edges running from ``low``
        to ``high`` inclusive; each of the last two components gets
        ``[0., 1.]``.

    Raises:
        ValueError: If ``intervals`` is smaller than 1.
    """
    if intervals < 1:
        raise ValueError("intervals must be at least 1")

    observationsInterval = []

    for low, high in observationSpace[:-2]:
        # linspace fixes both the number of edges and the exact end point.
        # np.arange with a fractional step can yield one edge too many, or a
        # last edge that differs from `high`, because of rounding.
        observationsInterval.append(list(np.linspace(low, high, intervals + 1)))

    for _ in range(2):
        observationsInterval.append([0., 1.])

    return observationsInterval

def makeQTable(observationsInterval, n_action):
    """Create a zero-initialized Q-table covering every combination of edges.

    Args:
        observationsInterval: One list of values per observation component.
        n_action: Number of actions; each state gets a list of this length.

    Returns:
        A dict mapping each tuple from the Cartesian product of
        ``observationsInterval`` to its own list of ``n_action`` zeros.
    """
    return {
        stateKey: [0.] * n_action
        for stateKey in itertools.product(*observationsInterval)
    }

# Check state
def checkInterval(observations, observationsInterval, observationSpace):
    """Map a raw observation vector to its discretized Q-table key.

    Args:
        observations: Observation vector; the first six entries are
            discretized and the last two are copied into the key.
        observationsInterval: Per-component lists of increasing bin edges
            (at least two edges for each of the first six components).
        observationSpace: Per-component (low, high) bounds used for clipping.

    Returns:
        A tuple of eight values: for each of the first six components, the
        lower edge of the bin that contains the clipped value, followed by the
        last two observation entries as floats.
    """
    observationsKey = ()

    for index, observation in enumerate(observations[:6]):
        value = float(observation)

        # Only index 4 is an angle (its bounds are +/-pi). Wrap it into
        # [-pi, pi) so the sign of the tilt is preserved; a plain `% pi` folds
        # negative angles onto positive ones. Index 5 has bounds of +/-5, so it
        # is clipped like the other components rather than wrapped.
        if index == 4:
            value = (value + np.pi) % (2 * np.pi) - np.pi

        # Clip to the configured limits.
        low, high = observationSpace[index]
        value = min(max(value, low), high)

        # Select the first bin whose upper edge is >= value, so a value that
        # sits exactly on an edge belongs to the bin below it. If rounding
        # leaves the value above every edge, fall back to the last bin so the
        # key always has one entry per component.
        edges = observationsInterval[index]
        binLower = next(
            (lower for lower, upper in zip(edges[:-1], edges[1:]) if value <= upper),
            edges[-2],
        )
        observationsKey += (binLower,)

    left, right = observations[-2:]
    observationsKey += (float(left), float(right))

    return observationsKey

In [None]:
# Discretize the observation space (intervals=3) and allocate a zeroed
# Q-table with one entry per action for every discretized state.
observationsInterval = makeIntervals(observationSpace, 3)
QTable = makeQTable(observationsInterval, n_action=env.action_space.n)
len(QTable)

In [None]:
def greedyEPS(epsStart, epsEnd, epsDecay, step):
    """Return the exploration rate for ``step`` under exponential decay.

    The value starts at ``epsStart`` for ``step == 0`` and approaches
    ``epsEnd`` as ``step`` grows; ``epsDecay`` sets how fast. ``step`` may be
    a scalar or a NumPy array.
    """
    span = epsStart - epsEnd
    exponent = -1 * step / epsDecay
    return epsEnd + span * np.exp(exponent)

# Example of EPS: evaluate the schedule on steps 0..99 and plot it
epsStart, epsEnd, epsDecay = 1.0, 0.0, 70
step = np.arange(100)
eps = greedyEPS(epsStart, epsEnd, epsDecay, step)

plt.plot(step, eps)
plt.xlabel("Steps")
plt.ylabel("EPS")

In [None]:
# Tabular Q-learning with an epsilon-greedy policy whose exploration rate
# decays with the episode index.
epsStart = 1.0
epsEnd = 0.0
epsDecay = 70
learningRate = 0.1
discountedEstimate = 0.95
episodes = 100000
maxSteps = 100
logEvery = 1000  # report progress every `logEvery` episodes instead of every episode

for episode in range(episodes):
    if episode % logEvery == 0:
        print("Starting Episode: ", episode)

    # Epsilon depends only on the episode index, so compute it once per episode.
    eps = greedyEPS(epsStart, epsEnd, epsDecay, episode)

    currentObservations, _ = env.reset()
    currentObservationsKey = checkInterval(currentObservations, observationsInterval, observationSpace)

    for step in range(maxSteps):
        # Explore with probability eps, otherwise act greedily on the Q-table.
        if np.random.random() < eps:
            action = env.action_space.sample()
        else:
            action = np.argmax(QTable[currentObservationsKey])

        newObservations, reward, termination, truncation, info = env.step(action)
        newObservationsKey = checkInterval(newObservations, observationsInterval, observationSpace)

        # A terminal state has no future return, so do not bootstrap from it.
        # A truncated episode is only cut short, so bootstrapping still applies.
        futureValue = 0.0 if termination else np.max(QTable[newObservationsKey])
        tdTarget = reward + discountedEstimate * futureValue
        QTable[currentObservationsKey][action] += learningRate * (tdTarget - QTable[currentObservationsKey][action])

        if termination or truncation:
            break

        currentObservations = newObservations
        currentObservationsKey = newObservationsKey

In [None]:
import gymnasium
from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder

# Record one episode (at most 200 steps) of the greedy policy read from QTable.
env = gymnasium.make(
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

observation, info = env.reset()
videoPath = "video.mp4"
video = VideoRecorder(env, videoPath)

try:
    for step in range(200):
        # capture_frame() renders the environment itself, so no separate
        # env.render() call is needed.
        video.capture_frame()

        # Choose the action from the observation of THIS environment; using a
        # variable left over from the training loop would select actions from
        # a stale state.
        action = np.argmax(QTable[checkInterval(observation, observationsInterval, observationSpace)])
        newObservation, reward, termination, truncation, info = env.step(action)

        if termination or truncation:
            break

        observation = newObservation
finally:
    # Always finalize the video file and release the environment, even if a
    # step raises.
    video.close()
    env.close()

In [None]:
# Number of discretized states held in the Q-table. Every key is pre-allocated
# by makeQTable and training only updates existing entries, so this matches the
# size shown right after the table was built.
len(QTable)