In [1]:
import gym
import numpy as np

In [2]:
def value_function(env, gamma, epsilon):
    """
    Calculates the state-value function for a given environment using the 
    iterative policy evaluation method.
    
    Args:
    - env: The environment for which the state-value function is to be calculated.
    - gamma: The discount factor for future rewards.
    - epsilon: The threshold for determining the convergence of the state-value function.
    - num_states: The number of states in the environment.
    
    Returns:
    - V: A numpy array containing the state-value function values for each state in the environment.
    """
    V = np.zeros(env.observation_space.n)
    while True:
        delta = 0 
        for state in range(env.observation_space.n):
            old_value = V[state]
            action_values = []
            for action in range(env.action_space.n):
                next_state_values = []
                for transition in env.P[state][action]: #env.P[state,action] => Transition Probability Function
                    probability, next_state, reward, done = transition
                    next_state_values.append(probability * (reward + gamma * V[next_state]))
                action_values.append(sum(next_state_values))
            V[state] = max(action_values)

            delta = max(delta, abs(old_value - V[state]))
        # for state in range(num_states):
        #     print(f"State {state}: Value = {V[state]}")
        # print("#"*30)
        if delta < epsilon:
            break
    return V

In [3]:
def Q_function(env,alpha=0.1,gamma=0.9,epsilon_greedy=1,nb_episodes=100):
    """
    This function implements the Q-learning algorithm for the given environment. It updates the Q-table based on the
    rewards and actions taken, and returns the final Q-table and rewards for each episode. The function takes in the 
    following parameters:
    
    * env - The OpenAI gym environment.
    * alpha - The learning rate. Default is 0.1.
    * gamma - The discount factor. Default is 0.9.
    * epsilon_greedy - The exploration-exploitation trade-off parameter. Default is 1.
    * nb_episodes - The number of episodes to run. Default is 100.
    
    The function returns the following:
    
    * Q - The final Q-table after training.
    * rewards - An array of rewards obtained for each episode.
    """
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    rewards = np.zeros(nb_episodes)
    final_epsilon = 0.1
    epsilon_decay = 0.9
    for episode in range(nb_episodes):
        state = env.reset()[0]
        while True:
            #epsilon greedy (Exploration/Exploitation)
            if np.random.uniform(0,1)< epsilon_greedy:
                action = env.action_space.sample() 
            else:
                action = np.argmax(Q[state, :])
            
            if epsilon_greedy > final_epsilon:
                epsilon_greedy *= epsilon_decay
        
            next_state, reward, done, _,_ = env.step(action)
            Q[state,action] += alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])
            rewards[episode] += reward
            state = next_state
            if done:
                break
    return Q,rewards


In [4]:
env = gym.make('FrozenLake-v1',is_slippery=False) #Stochastic=>is_slippery=True, Deterministic=>is_slippery=False
env.reset()
env.render()
gamma = 0.9
epsilon = 1e-6
num_states = env.observation_space.n
V = value_function(env, gamma, epsilon)
for state in range(num_states):
    print(f"State {state}: Value = {V[state]}")
print(env.step(env.action_space.sample()))
env.close()

State 0: Value = 0.5904900000000002
State 1: Value = 0.6561000000000001
State 2: Value = 0.7290000000000001
State 3: Value = 0.6561000000000001
State 4: Value = 0.6561000000000001
State 5: Value = 0.0
State 6: Value = 0.81
State 7: Value = 0.0
State 8: Value = 0.7290000000000001
State 9: Value = 0.81
State 10: Value = 0.9
State 11: Value = 0.0
State 12: Value = 0.0
State 13: Value = 0.9
State 14: Value = 1.0
State 15: Value = 0.0
(4, 0.0, False, False, {'prob': 1.0})


  logger.warn(


In [5]:
num_episodes = 100000
Q,rewards = Q_function(env,nb_episodes=num_episodes)
total_rewards = np.sum(rewards)
print(f"Total reward: {total_rewards}")

Total reward: 0.0
