In [None]:
!pip install gym
!pip install box2d-py

import gym
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Create the BipedalWalker-v3 environment. hardcore=True is forwarded to
# gym.make, i.e. the "hardcore" variant of the walker is requested here.
# This module-level `env` is the instance handed to every solver below.
env = gym.make("BipedalWalker-v3", hardcore=True)

# Dynamics Value Iteration

def dynamics_value_iteration(env, discount_factor=0.9, theta=1e-9, max_iterations=10000):
    """Run tabular value iteration on an environment with a known model.

    Value iteration needs a finite state/action set and the transition
    model, so this works on environments that expose

    * ``env.observation_space.n`` and ``env.action_space.n`` (discrete
      spaces), and
    * a transition table ``P`` (looked up on ``env.unwrapped`` when present,
      otherwise on ``env``) where ``P[s][a]`` is an iterable of
      ``(probability, next_state, reward, done)`` tuples -- the layout used
      by Gym's toy-text environments.

    The previous implementation used ``observation_space.shape[0]`` (the
    length of one observation vector) as the state count, stepped the
    simulator with a loop index without ever resetting it, and indexed ``V``
    with the observation returned by ``step``. None of that is meaningful
    for a value table, so the backup is now computed from the model.

    Args:
        env: Environment exposing discrete spaces and a transition table.
        discount_factor: Discount applied to the value of the successor.
        theta: Stop once the largest per-state change in a sweep is below
            this threshold.
        max_iterations: Upper bound on the number of sweeps, so a
            non-contracting problem (e.g. ``discount_factor >= 1``) cannot
            loop forever.

    Returns:
        Tuple ``(V, reward_curve)``: ``V`` is a float array with one value
        per state; ``reward_curve`` holds, for every sweep, the sum of the
        expected immediate rewards over all ``(state, action)`` pairs.

    Raises:
        ValueError: If the environment has no discrete spaces or no
            transition table (e.g. continuous ``Box`` spaces such as
            BipedalWalker), because a value table cannot be built for it.
    """
    n_states = getattr(env.observation_space, "n", None)
    n_actions = getattr(env.action_space, "n", None)
    model = getattr(getattr(env, "unwrapped", env), "P", None)
    if n_states is None or n_actions is None or model is None:
        raise ValueError(
            "dynamics_value_iteration requires discrete observation/action "
            "spaces and a transition table `P`; continuous environments "
            "need a function-approximation method instead"
        )
    nS, nA = int(n_states), int(n_actions)

    V = np.zeros(nS)
    reward_curve = []

    for _ in range(max_iterations):
        delta = 0.0
        total_reward = 0.0

        for s in range(nS):
            q_values = np.zeros(nA)
            for a in range(nA):
                for prob, next_state, reward, done in model[s][a]:
                    total_reward += prob * reward
                    # Terminal transitions contribute no future value.
                    future = 0.0 if done else V[next_state]
                    q_values[a] += prob * (reward + discount_factor * future)

            # Bellman optimality backup (in place, so later states in the
            # same sweep already see the updated value).
            best = q_values.max()
            delta = max(delta, abs(best - V[s]))
            V[s] = best

        reward_curve.append(total_reward)

        if delta < theta:
            break

    return V, reward_curve

# Approximate Dynamics Programming using Polynomial Features

def approximate_dynamics_programming(env, discount_factor=0.9):
    """Attempt fitted value iteration with a linear model on polynomial features.

    Each pass builds one regression target per index in ``range(nS)`` from
    one-step returns obtained via ``env.step``, fits a ``LinearRegression``
    on ``features`` and keeps its coefficients as ``weights`` for the next
    pass.

    Args:
        env: Environment whose observation and action spaces expose
            ``shape``; it is stepped directly by this function.
        discount_factor: Multiplier applied to the estimated value of the
            next state.

    Returns:
        Tuple ``(V, reward_curve)``. ``V`` is the zero array created at the
        start (it is never reassigned below); ``reward_curve`` has one
        summed-reward entry per pass.

    NOTE(review): several things in this body look inconsistent and should
    be resolved before relying on the result:
      * ``nS``/``nA`` are the lengths of one observation/action vector
        (``shape[0]``), not counts of distinct states/actions.
      * ``weights`` is read inside the loop before its first assignment
        further down.
      * ``features`` has a single row (one sampled observation) while
        ``targets`` has ``nS`` entries.
      * ``env.step`` receives the loop index ``a`` and the environment is
        never reset or moved to "state" ``s`` in this function.
      * ``V`` is never updated, so the loop can only stop when ``targets``
        is numerically all zeros.
    """
    # Lengths of the observation and action vectors (see NOTE above).
    nS = env.observation_space.shape[0]
    nA = env.action_space.shape[0]

    # One random observation, reshaped to a single row, expanded into
    # polynomial features by the module-level generate_features helper.
    all_states = env.observation_space.sample().reshape(1, -1)
    features = generate_features(all_states)

    # Value array that is returned; nothing below writes to it.
    V = np.zeros(nS)

    # One summed-reward entry is appended per pass of the while-loop.
    reward_curve = []

    while True:
        # Regression targets for this pass, one slot per index s.
        targets = np.zeros(nS)
        total_reward = 0

        # s is only used as an index into targets.
        for s in range(nS):
            q_values = np.zeros(nA)

            # One environment step per action index; the step result is
            # unpacked as a 4-tuple.
            for a in range(nA):
                next_state, reward, _, _ = env.step(a)
                total_reward += reward
                features_next = generate_features(next_state.reshape(1, -1))
                # NOTE(review): `weights` is not bound yet on the first pass.
                q_values[a] = reward + discount_factor * np.max(np.dot(features_next, weights))

            # Greedy backup: best one-step estimate over the action indices.
            targets[s] = np.max(q_values)

        # Refit the linear model; only coef_ is kept (intercept_ is not used
        # when estimating the next-state value above).
        regression_model = LinearRegression()
        regression_model.fit(features, targets)
        weights = regression_model.coef_

        # Store the total reward in each iteration
        reward_curve.append(total_reward)

        # Compares against the untouched zero array V (see NOTE above).
        if np.allclose(V, targets):
            break

    return V, reward_curve

# Deep Function Approximation of Dynamics Programming

class ValueNet(nn.Module):
    """Fully connected network with a single ReLU hidden layer.

    Args:
        input_dim: Size of each input feature vector.
        output_dim: Number of outputs produced per input row.
        hidden_dim: Width of the hidden layer. Defaults to 32, the value
            that used to be hard-coded, so existing two-argument callers
            get the same architecture.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=32):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; no activation on the output."""
        return self.fc2(torch.relu(self.fc1(x)))

def deep_function_approximation(env, discount_factor=0.9, learning_rate=0.01, num_epochs=100):
    """Train a ``ValueNet`` on polynomial features and log rollout rewards.

    Every epoch performs one full-batch gradient step on an MSE loss and
    then runs 10 episodes in ``env``, choosing at each step the index of the
    largest network output; the rewards of those episodes are summed into
    one ``reward_curve`` entry.

    Args:
        env: Environment that is sampled, reset and stepped here.
        discount_factor: Accepted but not used anywhere in this body.
        learning_rate: Learning rate handed to the Adam optimizer.
        num_epochs: Number of train-then-evaluate rounds.

    Returns:
        Tuple ``(values_pred, reward_curve)``: the flattened network output
        for ``features`` after training, and the per-epoch reward totals.

    NOTE(review): points to confirm before using this function:
      * ``approximate_values`` is neither a parameter nor assigned in this
        function, and no module-level definition is visible in this file.
      * ``features`` holds a single sampled observation, so training fits
        one row only.
      * The network emits ``nA`` outputs while ``values_tensor`` gets a
        trailing axis of size 1 via ``unsqueeze(1)``; the loss relies on
        those shapes being compatible.
      * ``env.step`` is given the integer ``argmax`` index, and the results
        of ``reset``/``step`` are used as a bare observation and a 4-tuple
        respectively -- verify against the installed gym API.
    """
    # Lengths of the observation and action vectors; nS is not used below.
    nS = env.observation_space.shape[0]
    nA = env.action_space.shape[0]

    # One random observation as a single row, expanded to polynomial features.
    all_states = env.observation_space.sample().reshape(1, -1)
    features = generate_features(all_states)

    # Network maps a feature row to one output per action dimension.
    input_dim = features.shape[1]
    output_dim = nA
    value_net = ValueNet(input_dim, output_dim)

    # Define the loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(value_net.parameters(), lr=learning_rate)

    # Training inputs and regression targets as float32 tensors.
    features_tensor = torch.tensor(features, dtype=torch.float32)
    # NOTE(review): `approximate_values` must already exist as a global here.
    values_tensor = torch.tensor(approximate_values, dtype=torch.float32).unsqueeze(1)

    # Initialize the list to store the reward curve
    reward_curve = []

    # Train the value network
    for epoch in range(num_epochs):
        # Forward pass
        values_pred = value_net(features_tensor)

        # Compute the loss
        loss = criterion(values_pred, values_tensor)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Evaluate the current network: 10 episodes, rewards summed together.
        total_reward = 0
        for _ in range(10):
            state = env.reset()
            done = False
            while not done:
                # Index of the largest output for the current observation's
                # features; gradients are tracked here (no torch.no_grad()).
                action = value_net(torch.tensor(generate_features(state.reshape(1, -1)), dtype=torch.float32)).argmax().item()
                state, reward, done, _ = env.step(action)
                total_reward += reward

        # Store the total reward in each epoch
        reward_curve.append(total_reward)

    # Final predictions for the training features, detached from autograd.
    with torch.no_grad():
        values_pred = value_net(features_tensor).numpy().flatten()

    return values_pred, reward_curve

# Generate polynomial features for state representation
def generate_features(states):
    """Return the degree-2 polynomial expansion of ``states``.

    A fresh scikit-learn ``PolynomialFeatures(degree=2)`` transformer is
    created and fitted on every call, and its ``fit_transform`` result is
    returned unchanged.
    """
    return PolynomialFeatures(degree=2).fit_transform(states)

# Driver: run the three solvers one after another on the shared module-level
# `env`. Each call returns (values, reward_curve); only the reward curves are
# used afterwards, the value arrays are kept for inspection.

# Tabular value iteration.
values_vi, reward_curve_vi = dynamics_value_iteration(env)

# Linear regression on polynomial features.
values_adp, reward_curve_adp = approximate_dynamics_programming(env)

# Neural-network value approximation.
values_dfa, reward_curve_dfa = deep_function_approximation(env)

# Overlay the three reward curves on a single figure. The x-axis is simply
# the list index of each curve (labelled 'Epoch' for all three).
plt.figure(figsize=(10, 6))
plt.plot(reward_curve_vi, label='Dynamics Value Iteration')
plt.plot(reward_curve_adp, label='Approximate Dynamics Programming')
plt.plot(reward_curve_dfa, label='Deep Function Approximation')
plt.xlabel('Epoch')
plt.ylabel('Total Reward')
plt.title('Learning Curve')
plt.legend()
plt.show()

In [None]:
!pip install gym
!pip install box2d-py
import gym
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Create the BipedalWalker-v3 environment with gym.make's default arguments
# (no `hardcore` flag is passed here). The functions defined below receive
# this module-level `env`.
env = gym.make("BipedalWalker-v3")

# Dynamics Value Iteration Algorithm
# Dynamics Value Iteration Algorithm
def dynamics_value_iteration(env, gamma=0.99, max_iterations=1000, epsilon=1e-6):
    """Sweep a value list until it changes by less than ``epsilon``.

    Args:
        env: Environment; its ``observation_space.shape[0]`` and
            ``action_space.shape[0]`` set the loop bounds, and
            ``env.env.calc_reward`` supplies rewards.
        gamma: Discount applied to the bootstrapped value.
        max_iterations: Maximum number of sweeps.
        epsilon: Sweeps stop early once the largest per-entry change in a
            sweep is below this value.

    Returns:
        The value list (plain Python floats), one entry per index in
        ``range(env.observation_space.shape[0])``.

    NOTE(review):
      * ``calc_reward(state, action, next_state)`` is looked up on
        ``env.env`` and is not defined in this file -- confirm that the
        wrapped environment really provides it and what it returns.
      * The ``gamma * value_function[next_state]`` term is evaluated after
        the inner loop has finished, so ``next_state`` is always the last
        index; the rewards for all next-state indices are summed without
        any transition weighting. This looks unintended.
      * The loop bounds are vector lengths (``shape[0]``), not counts of
        distinct states/actions.
    """
    # One value per observation-vector index, all starting at 0.0.
    value_function = [0.0] * env.observation_space.shape[0]

    # Perform value iteration
    for iteration in range(max_iterations):
        # Largest absolute change seen in this sweep.
        delta = 0
        for state in range(env.observation_space.shape[0]):
            old_value = value_function[state]
            action_values = []
            for action in range(env.action_space.shape[0]):
                # Rewards for every candidate next-state index.
                next_state_rewards = []
                for next_state in range(env.observation_space.shape[0]):
                    next_state_rewards.append(env.env.calc_reward(state, action, next_state))
                # NOTE(review): next_state here is the loop's final value.
                action_values.append(sum(next_state_rewards) + gamma * value_function[next_state])
            # In-place update with the best action value.
            value_function[state] = max(action_values)
            delta = max(delta, abs(old_value - value_function[state]))
        if delta < epsilon:
            break

    return value_function

# Approximate Dynamics Programming with Polynomial Features
def approximate_dynamics_programming(env, degree=2):
    """Fit a linear model from per-element power features back to the states.

    For every index in ``range(env.observation_space.shape[0])`` a state is
    fetched with ``env.env.get_state(index)``. Its feature row is the
    concatenation, for each power ``1..degree``, of every state element
    raised to that power. A ``LinearRegression`` is then fitted with these
    feature rows as inputs and the states themselves as targets.

    NOTE(review): ``get_state`` is not defined in this file -- confirm the
    wrapped environment provides it and that it returns an indexable
    sequence of numbers.

    Args:
        env: Environment providing ``observation_space.shape`` and
            ``env.env.get_state``.
        degree: Highest power included in the feature rows.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # Collect one state per observation-vector index.
    states = [
        env.env.get_state(index)
        for index in range(env.observation_space.shape[0])
    ]

    # Feature row layout: all elements ** 1, then all elements ** 2, ...
    features = [
        [
            sample[pos] ** power
            for power in range(1, degree + 1)
            for pos in range(len(sample))
        ]
        for sample in states
    ]

    # Regress the raw states on their own power features.
    fitted = LinearRegression()
    fitted.fit(features, states)
    return fitted

# Deep Function Approximation of Dynamics Programming (using PyTorch)
import torch
import torch.nn as nn
import torch.optim as optim

class DynamicsModel(nn.Module):
    """Three-layer fully connected network with two ReLU hidden layers.

    Args:
        input_dim: Size of each input vector.
        output_dim: Size of each output vector.
        hidden_dim: Width shared by both hidden layers.
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        # input -> hidden -> hidden -> output
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply fc1 and fc2 with ReLU, then fc3 without an activation."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = layer(hidden).relu()
        return self.fc3(hidden)

def deep_function_approximation(env, hidden_dim=64, learning_rate=0.001, num_epochs=100):
    """Train a ``DynamicsModel`` on enumerated (state, action, next_state) indices.

    The training set is every combination of state index, action index and
    next-state index, each paired with ``env.env.calc_reward(...)``. The
    model is then trained one sample at a time with Adam and MSE loss, and
    the mean loss is printed after every epoch.

    Args:
        env: Environment providing ``observation_space.shape``,
            ``action_space.shape`` and ``env.env.calc_reward``.
        hidden_dim: Hidden width passed to ``DynamicsModel``.
        learning_rate: Learning rate for Adam.
        num_epochs: Number of passes over the enumerated data.

    Returns:
        The trained ``DynamicsModel``.

    NOTE(review):
      * ``calc_reward`` is not defined in this file -- confirm the wrapped
        environment provides it.
      * ``reward`` is stored in ``data`` but never used during training.
      * ``state + action`` adds two integer indices, so ``input_data`` holds
        a single number and ``target`` a single index, whereas the model is
        built for ``shape[0] + shape[0]``-style input and ``shape[0]``
        output sizes. The sizes do not appear to line up -- verify.
      * Every (state, action) pair is paired with every next-state index,
        so the same input is trained towards many different targets.
    """
    # Enumerate all index triples together with their reward.
    data = []
    for state in range(env.observation_space.shape[0]):
        for action in range(env.action_space.shape[0]):
            for next_state in range(env.observation_space.shape[0]):
                reward = env.env.calc_reward(state, action, next_state)
                data.append((state, action, next_state, reward))

    # Model sized from the observation and action vector lengths.
    model = DynamicsModel(env.observation_space.shape[0] + env.action_space.shape[0], env.observation_space.shape[0], hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # One optimizer step per sample (no batching).
    for epoch in range(num_epochs):
        total_loss = 0
        for state, action, next_state, reward in data:
            # NOTE(review): integer sum, not a concatenation of encodings.
            input_data = torch.tensor([state + action], dtype=torch.float32)
            target = torch.tensor([next_state], dtype=torch.float32)

            optimizer.zero_grad()
            output = model(input_data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Mean per-sample loss for this epoch.
        average_loss = total_loss / len(data)
        print(f"Epoch: {epoch+1}, Loss: {average_loss}")

    return model

# Driver for this cell: run the three routines defined above on the
# module-level `env` and keep their results in module-level names.

# Value list returned by the sweep-based value iteration.
value_function = dynamics_value_iteration(env)

# Linear regression fitted on per-element power features up to `degree`.
degree = 2
dynamics_model = approximate_dynamics_programming(env, degree)

# Neural dynamics model; the three settings below are passed positionally
# and equal the function's own defaults.
hidden_dim = 64
learning_rate = 0.001
num_epochs = 100
deep_model = deep_function_approximation(env, hidden_dim, learning_rate, num_epochs)
