In [None]:
#Cart Pole Balancing with Random Policy

In [3]:
pip install gym

Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
      --------------------------------------- 10.2/721.7 kB ? eta -:--:--
     - ----------------------------------- 30.7/721.7 kB 435.7 kB/s eta 0:00:02
     --- --------------------------------- 61.4/721.7 kB 656.4 kB/s eta 0:00:02
     ---- -------------------------------- 92.2/721.7 kB 525.1 kB/s eta 0:00:02
     ------ ----------------------------- 122.9/721.7 kB 554.9 kB/s eta 0:00:02
     -------- --------------------------- 163.8/721.7 kB 701.4 kB/s eta 0:00:01
     ---------- ------------------------- 204.8/721.7 kB 692.4 kB/s eta 0:00:01
     ---------- ------------------------- 215.0/721.7 kB 624.4 kB/s eta 0:00:01
     ----------- ------------------------ 235.5/721.7 kB 654.9 kB/s eta 0:00:01
     ------------- ---------------------- 266.2/721.7 kB 630.5 kB/s eta 0:00:01
    

In [5]:
import gym

# Create the environment.
# No render mode is requested here, so env.render() is not called in the loop;
# to watch the episodes, create the env with gym.make(..., render_mode="human").
env = gym.make("CartPole-v1")

# Number of episodes to run
num_episodes = 10

try:
    for episode in range(num_episodes):
        observation, _ = env.reset()  # Start a fresh episode
        total_reward = 0
        done = False

        while not done:
            action = env.action_space.sample()  # Choose a random action (0 or 1)
            observation, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            # The step API reports two separate end-of-episode flags: `terminated`
            # (a terminal state was reached) and `truncated` (the episode was cut
            # off, e.g. by a time limit). Either one means we must reset.
            done = terminated or truncated

        print(f"Episode {episode + 1}: Total Reward = {total_reward}")
finally:
    # Release the environment even if an episode raises.
    env.close()


Episode 1: Total Reward = 17.0
Episode 2: Total Reward = 30.0
Episode 3: Total Reward = 21.0
Episode 4: Total Reward = 15.0
Episode 5: Total Reward = 30.0
Episode 6: Total Reward = 24.0
Episode 7: Total Reward = 15.0
Episode 8: Total Reward = 16.0
Episode 9: Total Reward = 14.0
Episode 10: Total Reward = 10.0


In [None]:
#Unified Notation for Episodic and Continuing Tasks
# Note: the random policy used below performs poorly; proper RL training would improve performance.

In [1]:
pip install gym numpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import gym
import numpy as np

def run_rl_task(env_name, gamma=0.99, num_episodes=10, render_mode=None, max_steps=None):
    """
    Runs an RL environment using a random policy and calculates discounted return.

    For every episode one line is printed with the undiscounted total reward,
    the number of steps taken and the discounted return
    G = r_0 + gamma * r_1 + gamma^2 * r_2 + ...

    Parameters:
    - env_name: The Gym environment name (e.g., "CartPole-v1", "MountainCarContinuous-v0").
    - gamma: Discount factor (0 ≤ γ ≤ 1).
    - num_episodes: Number of episodes to run.
    - render_mode: Optional render mode forwarded to gym.make (e.g. "human").
      When None (default) gym.make is called exactly as before and nothing is rendered.
    - max_steps: Optional cap on the number of steps per episode. Useful for
      continuing tasks that never report terminated/truncated; None means no cap.

    Returns:
    - None. Results are printed.
    """
    # Only forward render_mode when the caller asked for one, so the default
    # call stays identical to gym.make(env_name).
    make_kwargs = {} if render_mode is None else {"render_mode": render_mode}
    env = gym.make(env_name, **make_kwargs)

    try:
        for episode in range(num_episodes):
            env.reset()
            rewards = []
            terminated = truncated = False

            # An episode is over as soon as the env reports EITHER flag; stepping
            # again without a reset is not valid, whatever the env spec says.
            while not (terminated or truncated):
                if max_steps is not None and len(rewards) >= max_steps:
                    break
                action = env.action_space.sample()  # Take random action
                _, reward, terminated, truncated, _ = env.step(action)
                rewards.append(reward)

            # Compute discounted return by folding the rewards back to front:
            # G_t = r_t + gamma * G_{t+1}
            G = 0
            for reward in reversed(rewards):
                G = reward + gamma * G

            total_reward = sum(rewards)
            step = len(rewards)
            print(f"Episode {episode + 1}: Total Reward = {total_reward}, Steps = {step}, Discounted Return = {G:.2f}")
    finally:
        # Release the environment even if an episode raises.
        env.close()

# Demo runs: (banner printed before the run, Gym environment id).
# NOTE(review): the second banner says "Continuing", but the recorded runs for
# that environment all stop at 999 steps, which suggests a step limit —
# confirm whether it is really a continuing task before relying on the label.
demo_runs = [
    ("\nRunning Episodic Task: CartPole-v1", "CartPole-v1"),
    ("\nRunning Continuing Task: MountainCarContinuous-v0", "MountainCarContinuous-v0"),
]

for banner, env_id in demo_runs:
    print(banner)
    run_rl_task(env_id, gamma=0.99, num_episodes=5)



Running Episodic Task: CartPole-v1
Episode 1: Total Reward = 26.0, Steps = 26, Discounted Return = 23.00
Episode 2: Total Reward = 11.0, Steps = 11, Discounted Return = 10.47
Episode 3: Total Reward = 11.0, Steps = 11, Discounted Return = 10.47
Episode 4: Total Reward = 19.0, Steps = 19, Discounted Return = 17.38
Episode 5: Total Reward = 28.0, Steps = 28, Discounted Return = 24.53

Running Continuing Task: MountainCarContinuous-v0
Episode 1: Total Reward = -33.06091481609596, Steps = 999, Discounted Return = -3.13
Episode 2: Total Reward = -33.939005153118906, Steps = 999, Discounted Return = -3.27
Episode 3: Total Reward = -33.339596986669726, Steps = 999, Discounted Return = -3.19
Episode 4: Total Reward = -31.146623868199548, Steps = 999, Discounted Return = -2.86


  gym.logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):
  gym.logger.warn(


Episode 5: Total Reward = -33.20020581169291, Steps = 999, Discounted Return = -3.34


In [None]:
#Policies and Value Functions
#The optimal policy guides the agent toward the goal.
#The value function estimates the expected return from each state.

In [5]:
import numpy as np
import gym

def evaluate_policy(env, policy, gamma=0.99, theta=1e-6):
    """
    Computes the state-value function V(s) for a given policy using iterative policy evaluation.

    Parameters:
    - env: Environment exposing `observation_space.n` and a transition table `P`,
      where `P[s][a]` is an iterable of (prob, next_state, reward, done) tuples.
      `env` may be a wrapper; the table is then read from `env.unwrapped`.
    - policy: Array-like indexed as policy[s][a] giving the probability of action a in state s.
    - gamma: Discount factor.
    - theta: Convergence threshold on the largest per-sweep change of V.

    Returns:
    - V: numpy array of length `observation_space.n` with the estimated state values.
    """
    # Read the transition table from the base environment: wrappers are not
    # guaranteed to forward arbitrary attributes such as `P`. Plain objects
    # without `unwrapped` are used as-is.
    P = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n

    V = np.zeros(n_states)  # Initialize value function
    while True:
        delta = 0.0  # Largest change seen in this sweep
        for s in range(n_states):
            v = 0.0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in P[s][a]:
                    # No bootstrapping from a transition that ends the episode.
                    future = 0.0 if done else gamma * V[next_state]
                    v += action_prob * prob * (reward + future)
            delta = max(delta, abs(V[s] - v))
            V[s] = v  # In-place update: later states in this sweep see the new value
        if delta < theta:  # Stop when values converge
            break
    return V

def improve_policy(env, V, gamma=0.99):
    """
    Computes a new greedy policy using the updated value function V(s).

    Parameters:
    - env: Environment exposing `observation_space.n`, `action_space.n` and a
      transition table `P`, where `P[s][a]` is an iterable of
      (prob, next_state, reward, done) tuples. `env` may be a wrapper; the
      table is then read from `env.unwrapped`.
    - V: Array-like of state values indexed by state.
    - gamma: Discount factor.

    Returns:
    - policy: (n_states, n_actions) numpy array; each row is one-hot on the
      action with the highest one-step lookahead value (ties go to the lowest
      action index, as with np.argmax).
    """
    # Read the transition table from the base environment: wrappers are not
    # guaranteed to forward arbitrary attributes such as `P`.
    P = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    policy = np.zeros((n_states, n_actions))  # Initialize new policy
    for s in range(n_states):
        q_values = np.zeros(n_actions)
        for a in range(n_actions):
            for prob, next_state, reward, done in P[s][a]:
                # No bootstrapping from a transition that ends the episode.
                future = 0.0 if done else gamma * V[next_state]
                q_values[a] += prob * (reward + future)
        policy[s, np.argmax(q_values)] = 1.0  # One-hot encoding of the greedy action
    return policy

def policy_iteration(env, gamma=0.99, max_iterations=1000):
    """
    Policy Iteration: Alternates between policy evaluation and improvement.

    Parameters:
    - env: Environment passed through to evaluate_policy / improve_policy; must
      expose `observation_space.n` and `action_space.n`.
    - gamma: Discount factor.
    - max_iterations: Maximum number of improvement steps (0 returns the
      initial uniform policy together with its value function).

    Returns:
    - (policy, V): the final policy as an (n_states, n_actions) array and the
      value function of that same policy.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.ones((n_states, n_actions)) / n_actions  # Start from the uniform random policy

    # Evaluate before the loop so V is always defined (even for
    # max_iterations=0) and always belongs to the policy that is returned.
    V = evaluate_policy(env, policy, gamma)
    for _ in range(max_iterations):
        new_policy = improve_policy(env, V, gamma)
        if np.all(policy == new_policy):  # Stop if policy is stable
            break
        policy = new_policy
        V = evaluate_policy(env, policy, gamma)
    return policy, V

def value_iteration(env, gamma=0.99, theta=1e-6):
    """
    Value Iteration: Computes the optimal value function and policy.

    Parameters:
    - env: Environment exposing `observation_space.n`, `action_space.n` and a
      transition table `P`, where `P[s][a]` is an iterable of
      (prob, next_state, reward, done) tuples. `env` may be a wrapper; the
      table is then read from `env.unwrapped`.
    - gamma: Discount factor.
    - theta: Convergence threshold on the largest per-sweep change of V.

    Returns:
    - (policy, V): a one-hot greedy policy of shape (n_states, n_actions) and
      the converged value function of length n_states.
    """
    # Read the transition table from the base environment: wrappers are not
    # guaranteed to forward arbitrary attributes such as `P`.
    P = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    V = np.zeros(n_states)  # Initialize values

    def action_values(s):
        """One-step lookahead: Q(s, a) for every action under the current V."""
        q_values = np.zeros(n_actions)
        for a in range(n_actions):
            for prob, next_state, reward, done in P[s][a]:
                # No bootstrapping from a transition that ends the episode.
                future = 0.0 if done else gamma * V[next_state]
                q_values[a] += prob * (reward + future)
        return q_values

    while True:
        delta = 0.0
        for s in range(n_states):
            max_q = np.max(action_values(s))
            delta = max(delta, abs(V[s] - max_q))
            V[s] = max_q  # In-place update: later states in this sweep see the new value
        if delta < theta:  # Stop if values converge
            break

    # Derive optimal policy from the optimal value function
    policy = np.zeros((n_states, n_actions))
    for s in range(n_states):
        policy[s, np.argmax(action_values(s))] = 1.0  # One-hot encoding

    return policy, V

# Build the slippery FrozenLake environment shared by both solvers
env = gym.make("FrozenLake-v1", is_slippery=True)

# --- Policy Iteration: solve, then report policy and values ---
print("\n🔹 Running Policy Iteration...")
optimal_policy_pi, optimal_value_pi = policy_iteration(env)
print(
    "Optimal Policy (Policy Iteration):",
    optimal_policy_pi,
    "Optimal Value Function:",
    optimal_value_pi,
    sep="\n",
)

# --- Value Iteration: solve, then report policy and values ---
print("\n🔹 Running Value Iteration...")
optimal_policy_vi, optimal_value_vi = value_iteration(env)
print(
    "Optimal Policy (Value Iteration):",
    optimal_policy_vi,
    "Optimal Value Function:",
    optimal_value_vi,
    sep="\n",
)



🔹 Running Policy Iteration...
Optimal Policy (Policy Iteration):
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function:
[0.54201383 0.49878715 0.47067694 0.45683158 0.55844021 0.
 0.35833998 0.         0.59178998 0.64307352 0.61520205 0.
 0.         0.7417161  0.86283524 0.        ]

🔹 Running Value Iteration...
Optimal Policy (Value Iteration):
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function:
[0.54201404 0.49878743 0.47067727 0.45683193 0.5584404  0.
 0.35834012 0.         0.59179013 0.64307363 0.61520214 0.
 0.         0.74171617 0.86283528 0.        ]


In [None]:
#Optimal Policies and Optimal Value Functions

In [9]:
import numpy as np
import gym

def value_iteration(env, gamma=0.99, theta=1e-6):
    """
    Performs Value Iteration to find the optimal value function and policy.

    Parameters:
    - env: Environment exposing `observation_space.n`, `action_space.n` and a
      transition table `P`, where `P[s][a]` is an iterable of
      (prob, next_state, reward, done) tuples. `env` may be a wrapper; the
      table is then read from `env.unwrapped`.
    - gamma: Discount factor.
    - theta: Convergence threshold on the largest per-sweep change of V.

    Returns:
    - (policy, V): a one-hot greedy policy of shape (n_states, n_actions) and
      the converged value function of length n_states.
    """
    # Wrappers are not guaranteed to forward arbitrary attributes such as `P`,
    # so take the table from the base environment when there is one.
    transitions = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    V = np.zeros(n_states)  # Initialize value function

    def lookahead(s):
        """Return Q(s, a) for every action a, using the current estimate V."""
        q_values = np.zeros(n_actions)
        for a in range(n_actions):
            for prob, next_state, reward, done in transitions[s][a]:
                # A transition that ends the episode contributes no future value.
                future = 0.0 if done else gamma * V[next_state]
                q_values[a] += prob * (reward + future)
        return q_values

    while True:
        delta = 0.0
        for s in range(n_states):
            max_q = np.max(lookahead(s))
            delta = max(delta, abs(V[s] - max_q))
            V[s] = max_q  # Update value function in place

        if delta < theta:  # Convergence check
            break

    # Derive optimal policy from the optimal value function
    policy = np.zeros((n_states, n_actions))
    for s in range(n_states):
        policy[s, np.argmax(lookahead(s))] = 1.0  # One-hot encoding

    return policy, V

# Build the slippery FrozenLake environment
env = gym.make("FrozenLake-v1", is_slippery=True)

# Solve with Value Iteration, then report the policy and its value function
print("\n🔹 Running Value Iteration...")
optimal_policy, optimal_value = value_iteration(env)

print(
    "Optimal Policy:",
    optimal_policy,
    "Optimal Value Function:",
    optimal_value,
    sep="\n",
)



🔹 Running Value Iteration...
Optimal Policy:
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function:
[0.54201404 0.49878743 0.47067727 0.45683193 0.5584404  0.
 0.35834012 0.         0.59179013 0.64307363 0.61520214 0.
 0.         0.74171617 0.86283528 0.        ]


In [None]:
#Optimality and Approximation

In [31]:
pip install gym numpy torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Define the Deep Q-Network
class DQN(nn.Module):
    """
    Fully connected Q-network: input -> hidden -> hidden -> one output per action.

    Parameters:
    - input_dim: Size of the input (state) vector.
    - output_dim: Number of outputs (one Q-value per action).
    - hidden_dim: Width of the two hidden layers (default 64, the original size).
    """

    def __init__(self, input_dim, output_dim, hidden_dim=64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the network output for `x`; ReLU after the first two layers, none on the last."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Define the Deep Q-Learning Agent
class DQNAgent:
    """
    Deep Q-Learning agent with an epsilon-greedy policy and a replay memory.

    A single network (`self.model`) is used both for the current Q-values and
    for the bootstrap target; there is no separate target network.

    Parameters:
    - state_dim: Size of the state vector fed to the network.
    - action_dim: Number of discrete actions.
    - gamma: Discount factor used in the TD target.
    - lr: Learning rate of the Adam optimizer.
    - batch_size: Number of transitions sampled per training step.
    - memory_size: Maximum number of transitions kept in the replay memory.
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=0.001, batch_size=64, memory_size=10000):
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)  # Oldest transitions are dropped when full
        self.model = DQN(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        self.action_dim = action_dim

    def choose_action(self, state, epsilon=0.1):
        """Return a random action with probability `epsilon`, otherwise the greedy action."""
        if np.random.rand() < epsilon:
            return np.random.choice(self.action_dim)  # Random action (exploration)
        state_t = torch.as_tensor(np.asarray(state, dtype=np.float32))
        with torch.no_grad():
            return torch.argmax(self.model(state_t)).item()  # Greedy action

    def store_experience(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        """
        Run one gradient step on a random minibatch from the replay memory.

        Does nothing (returns None) until the memory holds at least
        `batch_size` transitions.
        """
        if len(self.memory) < self.batch_size:
            return  # Not enough experiences to train

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack each field into ONE numpy array before handing it to torch.
        # Building a tensor directly from a tuple of ndarrays goes through a
        # slow element-wise path and makes PyTorch emit a UserWarning.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32))

        # Q(s, a) for the actions that were actually taken
        q_values = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # TD target: r + gamma * max_a' Q(s', a'), with the bootstrap term
        # masked out where done == 1. No gradient flows through the target.
        with torch.no_grad():
            next_q_values = self.model(next_states).max(dim=1).values
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = self.criterion(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

# Training Loop
env = gym.make("CartPole-v1")
# Size the network from the environment's spaces instead of hard-coding them.
agent = DQNAgent(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
)

num_episodes = 500
epsilon = 1.0  # Initial exploration rate
epsilon_decay = 0.995  # Multiplicative decay applied after every episode
min_epsilon = 0.01  # Exploration never drops below this

try:
    for episode in range(num_episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.choose_action(state, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Store `terminated` (not `truncated`) as the done flag: a cut-off
            # episode has not reached a terminal state, so the TD target
            # should still bootstrap from next_state.
            agent.store_experience(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward
            agent.train()
            # The episode ends on EITHER flag; stepping on after a truncation
            # (e.g. a time limit) without reset is not valid.
            done = terminated or truncated

        epsilon = max(min_epsilon, epsilon * epsilon_decay)  # Decay epsilon
        print(f"Episode {episode+1}: Reward = {total_reward}")
finally:
    # Release the environment even if training raises.
    env.close()

  if not isinstance(terminated, (bool, np.bool8)):
  states = torch.FloatTensor(states)


Episode 1: Reward = 23.0
Episode 2: Reward = 9.0
Episode 3: Reward = 13.0
Episode 4: Reward = 17.0
Episode 5: Reward = 25.0
Episode 6: Reward = 18.0
Episode 7: Reward = 17.0
Episode 8: Reward = 15.0
Episode 9: Reward = 17.0
Episode 10: Reward = 33.0
Episode 11: Reward = 39.0
Episode 12: Reward = 31.0
Episode 13: Reward = 20.0
Episode 14: Reward = 10.0
Episode 15: Reward = 42.0
Episode 16: Reward = 21.0
Episode 17: Reward = 15.0
Episode 18: Reward = 52.0
Episode 19: Reward = 13.0
Episode 20: Reward = 30.0
Episode 21: Reward = 40.0
Episode 22: Reward = 19.0
Episode 23: Reward = 20.0
Episode 24: Reward = 15.0
Episode 25: Reward = 15.0
Episode 26: Reward = 16.0
Episode 27: Reward = 23.0
Episode 28: Reward = 18.0
Episode 29: Reward = 14.0
Episode 30: Reward = 12.0
Episode 31: Reward = 12.0
Episode 32: Reward = 11.0
Episode 33: Reward = 18.0
Episode 34: Reward = 29.0
Episode 35: Reward = 19.0
Episode 36: Reward = 14.0
Episode 37: Reward = 38.0
Episode 38: Reward = 10.0
Episode 39: Reward = 1