In [None]:
import numpy as np

In [None]:
class VolcanoCrossingEnv:
    """Square grid world with a start cell, a goal cell and penalised volcano cells.

    States are ``(x, y)`` tuples. Actions are integers:
    0 = up (``x - 1``), 1 = down (``x + 1``), 2 = left (``y - 1``),
    3 = right (``y + 1``). A move that would leave the grid is clamped, so the
    agent stays on the border cell.

    Rewards returned by :meth:`step`: ``-1`` by default, ``-100`` when the new
    state is a volcano, ``100`` when the new state is ``end`` (the goal check
    runs last and therefore wins if a cell is both). Only reaching ``end``
    finishes an episode; stepping on a volcano does not.

    NOTE: ``end`` is not adapted to ``grid_size``. Because moves are clamped to
    ``[0, grid_size - 1]``, an ``end`` outside the grid can never be reached.
    """

    def __init__(self, grid_size=5, start=(0, 0), end=(4, 4), volcanoes=None, slip_probability=0.0):
        """Store the layout and place the agent on ``start``.

        Args:
            grid_size: Side length of the square grid.
            start: ``(x, y)`` state used by :meth:`reset`.
            end: ``(x, y)`` goal state.
            volcanoes: Container of ``(x, y)`` states that yield the volcano
                penalty. ``None`` (the default) means "no volcanoes".
            slip_probability: Probability that :meth:`step` replaces the
                requested action with a uniformly random one.
        """
        self.grid_size = grid_size
        self.start = start
        self.end = end
        # A fresh list per instance: a shared ``[]`` default would be aliased by
        # every environment built without an explicit ``volcanoes`` argument.
        self.volcanoes = [] if volcanoes is None else volcanoes
        self.slip_probability = slip_probability
        self.state = start

    def step(self, action):
        """Apply ``action`` (possibly replaced by a slip) and advance the state.

        Returns:
            ``(state, reward, done)`` where ``done`` is True only when the new
            state equals ``end``.
        """
        if np.random.uniform(0, 1) < self.slip_probability:
            # Slippery step, take a random action
            action = np.random.choice([0, 1, 2, 3])

        x, y = self.state
        if action == 0:  # up
            x = max(0, x - 1)
        elif action == 1:  # down
            x = min(self.grid_size - 1, x + 1)
        elif action == 2:  # left
            y = max(0, y - 1)
        elif action == 3:  # right
            y = min(self.grid_size - 1, y + 1)
        # Any other action value leaves the position unchanged.

        self.state = (x, y)
        reward = -1  # default reward
        if self.state in self.volcanoes:
            reward = -100  # penalty for stepping on a volcano
        if self.state == self.end:
            reward = 100  # reward for reaching the end
        return self.state, reward, self.state == self.end

    def reset(self):
        """Move the agent back to ``start`` and return that state."""
        self.state = self.start
        return self.state

In [None]:
class Agent:
    """Tabular epsilon-greedy agent that owns a ``VolcanoCrossingEnv``.

    Attributes:
        env: The environment instance created in ``__init__``.
        action_space: ``[0, 1, 2, 3]`` (up, down, left, right).
        state_space: Every ``(i, j)`` cell of the grid.
        Q: Dict mapping ``(state, action)`` to an action-value, initialised to 0.
        returns, visited: Dicts keyed like ``Q`` and initialised to 0; no method
            of this class reads or updates them.
        policy: Dict mapping each state to a randomly drawn initial action; no
            method of this class reads it.
        epsilon: Exploration probability used by :meth:`choose_action`.
        rewards: Zero-filled array of length 5000.
        memory: Empty list; no method of this class uses it.
    """

    def __init__(self, grid_size=5, slip_probability=0.0, epsilon=0.1):
        """Build the environment and zero-initialise all tables."""
        self.env = VolcanoCrossingEnv(grid_size=grid_size, slip_probability=slip_probability)
        self.grid_size = self.env.grid_size
        self.action_space = [0, 1, 2, 3]  # Assuming actions are up, down, left, right
        self.state_space = [(i, j) for i in range(self.grid_size) for j in range(self.grid_size)]
        self.Q = {}
        self.returns = {}
        self.visited = {}
        self.policy = {}
        self.epsilon = epsilon
        self.rewards = np.zeros(5000)  # Initialize rewards array
        self.memory = []

        for state in self.state_space:
            for action in self.action_space:
                self.Q[(state, action)] = 0
                self.returns[(state, action)] = 0
                self.visited[(state, action)] = 0
            self.policy[state] = np.random.choice(self.action_space)

    def get_state(self):
        """Return the environment's current state."""
        return self.env.state

    def max_action(self, state):
        """Return the greedy action for ``state``.

        The result is the ``argmax`` index into ``action_space``; since the
        actions are ``0..3`` the index is the action itself. Ties resolve to
        the lowest index.
        """
        values = np.array([self.Q[state, a] for a in self.action_space])
        action = np.argmax(values)
        return action

    def update_Q(self, state, action, reward, next_state, alpha=0.5, gamma=0.95):
        """Q-learning update: bootstrap from the best action in ``next_state``."""
        max_future_q = max([self.Q[(next_state, a)] for a in self.action_space])
        current_q = self.Q[(state, action)]
        self.Q[(state, action)] = (1 - alpha) * current_q + alpha * (reward + gamma * max_future_q)

    def update_Q_sarsa(self, state, action, reward, next_state, next_action, alpha=0.5, gamma=0.95):
        """SARSA update: bootstrap from the action actually chosen next."""
        future_q = self.Q[(next_state, next_action)]
        current_q = self.Q[(state, action)]
        self.Q[(state, action)] = (1 - alpha) * current_q + alpha * (reward + gamma * future_q)

    def update_Q_monte_carlo(self, episode, alpha=0.5, gamma=0.95):
        """Every-visit, constant-alpha Monte Carlo update over a finished episode.

        Args:
            episode: Sequence of ``(state, action, reward)`` tuples in the order
                they occurred.
            alpha: Step size.
            gamma: Discount factor.

        The episode is walked backwards so the discounted return
        ``G_t = r_t + gamma * G_{t+1}`` can be accumulated in one pass; each
        visited ``(state, action)`` is then moved toward its return.
        """
        discounted_return = 0.0
        for state, action, reward in reversed(episode):
            discounted_return = reward + gamma * discounted_return
            current_q = self.Q[(state, action)]
            self.Q[(state, action)] = (1 - alpha) * current_q + alpha * discounted_return

    def choose_action(self, state):
        """Epsilon-greedy choice: random action with probability ``epsilon``."""
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.action_space)  # exploration
        else:
            return self.max_action(state)  # exploitation

In [None]:
def sarsa(agent, episodes=1000):
    """Train ``agent`` with on-policy SARSA for ``episodes`` episodes.

    The action for the successor state is picked before the update, passed to
    ``agent.update_Q_sarsa``, and then executed on the following step.
    """
    for _ in range(episodes):
        current = agent.env.reset()
        move = agent.choose_action(current)
        while True:
            successor, payoff, finished = agent.env.step(move)
            upcoming = agent.choose_action(successor)
            agent.update_Q_sarsa(current, move, payoff, successor, upcoming)
            current, move = successor, upcoming
            if finished:
                break

In [None]:
def q_learning(agent, episodes=1000):
    """Train ``agent`` with Q-learning for ``episodes`` episodes.

    Each step picks an action via ``agent.choose_action``, applies it to the
    environment, and hands the transition to ``agent.update_Q``.
    """
    for _ in range(episodes):
        current = agent.env.reset()
        while True:
            move = agent.choose_action(current)
            successor, payoff, finished = agent.env.step(move)
            agent.update_Q(current, move, payoff, successor)
            current = successor
            if finished:
                break

In [None]:
def monte_carlo(agent, episodes=1000):
    """Train ``agent`` with episodic Monte Carlo updates.

    Each episode is rolled out to completion while recording every
    ``(state, action, reward)`` triple; the full list is then passed to
    ``agent.update_Q_monte_carlo``.
    """
    for _ in range(episodes):
        current = agent.env.reset()
        trajectory = []
        while True:
            move = agent.choose_action(current)
            successor, payoff, finished = agent.env.step(move)
            trajectory.append((current, move, payoff))
            current = successor
            if finished:
                break
        agent.update_Q_monte_carlo(trajectory)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def run_experiment(agent, algorithm, slip_probability, epsilon, num_episodes, Q_values_to_show=None, print_every=100):
    """Train ``agent`` with one algorithm, print progress and plot the average reward.

    Args:
        agent: Object exposing ``env``, ``Q``, ``choose_action`` and the
            ``update_Q*`` methods (see ``Agent``).
        algorithm: ``'sarsa'``, ``'q_learning'`` or ``'monte_carlo'``.
        slip_probability: Written to ``agent.env.slip_probability`` before
            training starts.
        epsilon: Written to ``agent.epsilon`` before training starts.
        num_episodes: Number of episodes to run.
        Q_values_to_show: Optional iterable of ``(state, action)`` pairs whose
            Q-values are printed every ``print_every`` episodes.
        print_every: Logging interval in episodes.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.

    Side effects:
        Replaces ``agent.rewards`` with a new array of length ``num_episodes``
        holding the total reward of each episode of this run, prints progress,
        and shows a matplotlib figure.
    """
    # Validate up front so a typo fails before any environment step is taken.
    if algorithm not in ('sarsa', 'q_learning', 'monte_carlo'):
        raise ValueError("Invalid algorithm. Choose 'sarsa', 'q_learning', or 'monte_carlo'.")

    agent.epsilon = epsilon
    # The slip probability lives on the environment; without this assignment
    # the argument would have no effect on the dynamics.
    agent.env.slip_probability = slip_probability
    # Size the history to this run so indexing by episode cannot overflow a
    # fixed-length buffer and no values from an earlier run linger in it.
    agent.rewards = np.zeros(num_episodes)

    rewards_data = {'Episode': [], 'Average Reward': []}

    for episode in range(num_episodes):
        state = agent.env.reset()
        action = agent.choose_action(state)
        episode_data = []  # (state, action, reward) triples, used by Monte Carlo only
        episode_rewards = 0  # Total reward collected in this episode
        done = False
        while not done:
            next_state, reward, done = agent.env.step(action)
            # Chosen before the update so SARSA bootstraps from the action that
            # is actually executed on the next step.
            next_action = agent.choose_action(next_state)
            if algorithm == 'sarsa':
                agent.update_Q_sarsa(state, action, reward, next_state, next_action)
            elif algorithm == 'q_learning':
                agent.update_Q(state, action, reward, next_state)
            else:  # 'monte_carlo': learn only once the episode is complete
                episode_data.append((state, action, reward))
            state, action = next_state, next_action
            episode_rewards += reward  # Accumulate rewards during the episode

        if algorithm == 'monte_carlo':
            agent.update_Q_monte_carlo(episode_data)

        agent.rewards[episode] = episode_rewards  # Update rewards array

        # Display Q-values for specific state-action pairs
        if Q_values_to_show and episode % print_every == 0:
            for shown_state, shown_action in Q_values_to_show:
                q_value = agent.Q.get((shown_state, shown_action), 0)
                print(f"Episode: {episode}, Q-value for state {shown_state} action {shown_action}: {q_value}")

        # Display average reward over the trailing window (current episode plus
        # up to 100 preceding ones)
        if episode % print_every == 0:
            avg_reward = np.mean(agent.rewards[max(0, episode - 100):episode + 1])
            rewards_data['Episode'].append(episode)
            rewards_data['Average Reward'].append(avg_reward)
            print(f"Episode: {episode}, Average Reward: {avg_reward}")

    # Plot average rewards
    rewards_df = pd.DataFrame(rewards_data)
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_df['Episode'], rewards_df['Average Reward'])
    plt.title('Average Reward Over Episodes')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.show()

# Example usage
slip_probability = 0.3  # Set your desired slip probability
epsilon = 0.1  # Set your desired epsilon value
num_episodes = 1000  # Set your desired number of episodes
Q_values_to_show = [((0, 0), 0), ((1, 1), 2)]  # Set specific state-action pairs for Q-values display

# Each algorithm gets its own freshly initialised agent: reusing one agent
# would let a later run start from the Q-table learned by the previous one,
# which makes the reward curves incomparable. The slip probability and epsilon
# are passed to the constructor so the environment is configured from the start.

# Run SARSA experiment
agent = Agent(slip_probability=slip_probability, epsilon=epsilon)
run_experiment(agent, 'sarsa', slip_probability, epsilon, num_episodes, Q_values_to_show)

# Run Q-Learning experiment
agent = Agent(slip_probability=slip_probability, epsilon=epsilon)
run_experiment(agent, 'q_learning', slip_probability, epsilon, num_episodes, Q_values_to_show)

# Run Monte Carlo experiment
agent = Agent(slip_probability=slip_probability, epsilon=epsilon)
run_experiment(agent, 'monte_carlo', slip_probability, epsilon, num_episodes)


In [7]:
import ipywidgets as widgets
from IPython.display import display

# Function to run the experiment
def run_experiment_gui(btn):
    """Button callback: train a fresh agent with the settings chosen in the widgets.

    Reads ``slip_slider``, ``epsilon_slider``, ``episode_slider`` and
    ``algorithm_dropdown`` and forwards their values to ``run_experiment``.
    ``btn`` is the argument passed by the button's ``on_click`` hook and is
    not used.
    """
    slip_probability = slip_slider.value
    epsilon = epsilon_slider.value
    num_episodes = episode_slider.value
    algorithm = algorithm_dropdown.value

    # Reset agent: a new one per click, configured with the slider values so
    # the environment really uses the selected slip probability.
    agent = Agent(slip_probability=slip_probability, epsilon=epsilon)
    # The episode slider can exceed the agent's default reward buffer length;
    # grow the buffer so per-episode indexing cannot run past its end.
    if len(agent.rewards) < num_episodes:
        agent.rewards = np.zeros(num_episodes)

    Q_values_to_show = [((0, 0), 0), ((1, 1), 2)]  # Set specific state-action pairs for Q-values display

    if algorithm in ('sarsa', 'q_learning'):
        run_experiment(agent, algorithm, slip_probability, epsilon, num_episodes, Q_values_to_show)
    elif algorithm == 'monte_carlo':
        # Monte Carlo runs are started without the Q-value printout.
        run_experiment(agent, algorithm, slip_probability, epsilon, num_episodes)

# Input controls: two probabilities in [0, 1], the episode count, and the algorithm
slip_slider = widgets.FloatSlider(
    description='Slip Probability:', value=0.1, min=0.0, max=1.0, step=0.01,
)
epsilon_slider = widgets.FloatSlider(
    description='Epsilon:', value=0.1, min=0.0, max=1.0, step=0.01,
)
episode_slider = widgets.IntSlider(
    description='Number of Episodes:', value=5000, min=100, max=10000, step=100,
)
algorithm_dropdown = widgets.Dropdown(
    description='Algorithm:', options=['sarsa', 'q_learning', 'monte_carlo'], value='sarsa',
)

# Button that triggers run_experiment_gui when clicked
run_button = widgets.Button(description='Run Experiment')
run_button.on_click(run_experiment_gui)

# Inputs stacked in the left column, the run button in the right column
input_widgets = widgets.VBox(children=[slip_slider, epsilon_slider, episode_slider, algorithm_dropdown])
run_widgets = widgets.VBox(children=[run_button])

# Render both columns side by side
display(widgets.HBox(children=[input_widgets, run_widgets]))


HBox(children=(VBox(children=(FloatSlider(value=0.1, description='Slip Probability:', max=1.0, step=0.01), Flo…