In [3]:
import gymnasium as gym
from collections import defaultdict
import skyscraper
import numpy as np

In [4]:
# Create the environment
# NOTE(review): q_learning() in this file indexes observations as
# observation['agent']['pos'] (a nested dict). MountainCar-v0 presumably
# returns a flat array observation instead, and `skyscraper` is imported but
# unused -- confirm which environment this notebook is meant to train on.
env = gym.make("MountainCar-v0", render_mode="human")  # "human" for visualization


def q_learning(env, alpha=0.1, gamma=0.9, epsilon=0.1, episodes=5000):
    """
    Tabular Q-learning with epsilon-greedy exploration and reward tracking.

    States are keyed by ``tuple(observation['agent']['pos'])``, so the
    environment must return dict observations with that layout from both
    ``reset()`` and ``step()``.

    Args:
        env: Environment exposing ``reset() -> (obs, info)``,
            ``step(action) -> (obs, reward, terminated, truncated, info)``
            and a discrete ``action_space.n``.
        alpha: Learning rate applied to each temporal-difference update.
        gamma: Discount factor for the bootstrapped next-state value.
        epsilon: Probability of taking a uniformly random action. It is
            constant for the whole run (no decay is applied).
        episodes: Number of training episodes to run.

    Returns:
        A ``(policy, Q)`` tuple. ``policy`` is a ``defaultdict`` mapping a
        state to its current greedy action (unseen states get a random
        action). ``Q`` is a ``defaultdict(float)`` keyed by
        ``(state, action)``.
    """
    n_actions = env.action_space.n
    Q = defaultdict(float)
    policy = defaultdict(lambda: np.random.choice(n_actions))
    total_rewards = []

    for episode in range(episodes):
        observation, _ = env.reset()
        state = tuple(observation['agent']['pos'])
        total_reward = 0
        done = False

        while not done:
            # Epsilon-greedy: exploit the stored policy, otherwise explore.
            if np.random.rand() > epsilon:
                action = policy[state]
            else:
                action = np.random.choice(n_actions)

            next_obs, reward, terminated, truncated, _ = env.step(action)
            # An episode ends on a true terminal state OR on a time-limit
            # truncation; ignoring `truncated` can loop forever.
            done = bool(terminated or truncated)
            next_state = tuple(next_obs['agent']['pos'])

            # A terminal state has no future return, so do not bootstrap from
            # it. A truncated (time-limited) transition still bootstraps.
            if terminated:
                best_next_value = 0.0
            else:
                best_next_value = max(
                    Q[(next_state, a)] for a in range(n_actions)
                )

            td_target = reward + gamma * best_next_value
            Q[(state, action)] += alpha * (td_target - Q[(state, action)])

            # Keep the greedy policy in sync with the updated Q-values.
            policy[state] = np.argmax(
                [Q[(state, a)] for a in range(n_actions)]
            )

            state = next_state
            total_reward += reward

        total_rewards.append(total_reward)

        if episode % 100 == 0:
            avg_reward = np.mean(total_rewards[-100:])
            print(f"Episode {episode}: Epsilon = {epsilon:.4f}, Avg Reward (last 100 episodes) = {avg_reward:.2f}")

    # Avoid numpy's empty-mean RuntimeWarning when no episode was run.
    overall_avg = np.mean(total_rewards) if total_rewards else float("nan")
    print(f"\n📊 Average Reward over all episodes: {overall_avg:.2f}")
    return policy, Q