In [21]:
import random
from robot import RecycleEnv
class MonteCarloAgent:
    def __init__(self,env,gamma=0.9,theta=0.01):
        self.name = "Monte Carlo Agent"
        self.env = env
        self.V = {"low": 0, "high": 0}
        self.pi = {"low": self.env.getPossibleActions("low"), "high": self.env.getPossibleActions("high")}
        self.gamma = gamma
        self.theta = theta

    
    def estimate_value_monte_carlo(self,n = 1000):
        episode = []
        # generate an episode
        for i in range(n):
            action = random.choice(self.pi[self.env.state])
            prev_state = self.env.state
            _, reward,_,_,_ = self.env.step(action)
            episode.append((prev_state,reward)) # s_i, r_i+1

        # calculate the return
        encountered_states = []
        returns = {"low": [], "high": []}

        G = 0
        for i in episode:
            G = self.gamma * G + i[1]
            if i[0] not in encountered_states:
                returns[i[0]].append(G)
                encountered_states.append(i[0])
                self.V[i[0]] = sum(returns[i[0]])/len(returns[i[0]])

    def policy_evaluation(self):
        states = ["low", "high"]

        while True:
            delta = 0
            for state in states:
                v = self.V[state]
                value = self.calculate_state_value(state)
                self.V[state] = value
                delta = max(delta, abs(v - self.V[state]))

            if delta < self.theta:
                break
    
    def calculate_state_value(self,state):
        actions = self.pi[state]
        value = 0
        for action in actions:
            value += (1/len(actions)) *self.calculate_q_value(state,action)
        return value/len(actions)


    def calculate_q_value(self,state,action):
        sum = 0
        for transition in self.env.getTransitionStatesandProbs(state, action):
            next_state, reward, prob, _ = transition
            sum += prob * (reward + self.gamma * self.V[next_state])
        return sum
    def reset_values(self):
        self.V = {"low": 0, "high": 0}

env = RecycleEnv()
agent = MonteCarloAgent(env)

In [31]:
agent.estimate_value_monte_carlo(3000)
print(agent.V)
agent.reset_values()
print(agent.V)

{'low': 10.84, 'high': 4.0}
{'low': 0, 'high': 0}


In [32]:
agent.policy_evaluation()
print(agent.V)

{'low': 0.31796212481061953, 'high': 1.9367127407122717}
