In [1]:
import random
from robot import RecycleEnv
class MonteCarloAgent:
    """State-value estimator for a uniformly random policy.

    Two independent ways of filling the value table ``V`` are provided:

    * ``estimate_value_monte_carlo`` -- first-visit Monte Carlo averaging over
      sampled, fixed-length episodes.
    * ``policy_evaluation`` -- repeated in-place sweeps that use the
      environment's transition model.

    The environment object must provide ``getPossibleActions(state)``,
    ``getTransitionStatesandProbs(state, action)``, ``reset()``,
    ``step(action)`` and a ``state`` attribute holding the current state.
    """

    def __init__(self, env, gamma=0.9, theta=0.01, states=("low", "high")):
        """Set up the value table and the action table for every state.

        Args:
            env: Environment object (see the class docstring for the members used).
            gamma: Discount factor applied to future rewards.
            theta: Convergence threshold used by ``policy_evaluation``.
            states: Iterable of state identifiers; defaults to the two states
                ``"low"`` and ``"high"``.
        """
        self.name = "Monte Carlo Agent"
        self.env = env
        self.states = tuple(states)
        self.V = {state: 0 for state in self.states}
        # pi[state] is the list of actions available in that state; an action
        # is always drawn uniformly from it.
        self.pi = {state: self.env.getPossibleActions(state) for state in self.states}
        self.gamma = gamma
        self.theta = theta

    def create_episodes(self, num_episodes, episode_length):
        """Sample episodes by acting uniformly at random in the environment.

        Args:
            num_episodes: Number of episodes to generate.
            episode_length: Number of steps taken in every episode.

        Returns:
            A list of episodes, each a list of ``(state, action, reward)``
            tuples in the order they were experienced.
        """
        episodes = []

        for _ in range(num_episodes):
            episode = []
            self.env.reset()

            for _step in range(episode_length):
                current_state = self.env.state
                action = random.choice(self.pi[current_state])
                # Only the reward is used from the 5-tuple returned by step().
                # NOTE(review): the three trailing fields look like
                # termination flags / info; they are ignored, so every episode
                # always runs for `episode_length` steps -- confirm intended.
                _next_state, reward, _, _, _ = self.env.step(action)
                episode.append((current_state, action, reward))

            episodes.append(episode)

        return episodes

    def calculate_values(self, episodes):
        """Update ``V`` with first-visit Monte Carlo averages.

        For each episode the discounted return is accumulated backwards, and
        the return observed at a state's first occurrence in that episode is
        recorded. ``V[state]`` becomes the mean of the recorded returns;
        states that were never visited keep their current value.

        Args:
            episodes: Iterable of episodes as produced by ``create_episodes``.
        """
        returns = {state: [] for state in self.states}

        for episode in episodes:
            # One forward pass records where each state first appears, which
            # avoids rescanning the episode prefix for every time step.
            first_visit = {}
            for t, (state, _, _) in enumerate(episode):
                first_visit.setdefault(state, t)

            G = 0
            # Walk backwards so G is the discounted return from step t onward.
            for t in range(len(episode) - 1, -1, -1):
                state, _, reward = episode[t]
                G = self.gamma * G + reward
                if first_visit[state] == t:
                    returns[state].append(G)

        # Average once, after all episodes have been processed.
        for state, samples in returns.items():
            if samples:
                self.V[state] = sum(samples) / len(samples)

    def estimate_value_monte_carlo(self, num_episodes=10, episode_length=100):
        """Sample episodes and store their first-visit value estimates in ``V``.

        Args:
            num_episodes: Number of episodes to sample.
            episode_length: Number of steps per episode.
        """
        episodes = self.create_episodes(num_episodes, episode_length)
        self.calculate_values(episodes)

    def policy_evaluation(self):
        """Sweep over all states until the largest update falls below ``theta``.

        Values are updated in place, so later states within a sweep already
        see the new values of earlier states.
        """
        while True:
            delta = 0
            for state in self.states:
                previous = self.V[state]
                self.V[state] = self.calculate_state_value(state)
                delta = max(delta, abs(previous - self.V[state]))

            if delta < self.theta:
                break

    def calculate_state_value(self, state):
        """Return the mean action value of ``state`` under the uniform policy.

        Returns 0 when the state has no available actions.
        """
        actions = self.pi[state]
        if not actions:
            return 0

        weight = 1 / len(actions)
        value = 0
        for action in actions:
            value += weight * self.calculate_q_value(state, action)
        return value

    def calculate_q_value(self, state, action):
        """Return the expected one-step return of taking ``action`` in ``state``.

        Each transition contributes ``prob * (reward + gamma * V[next_state])``.
        """
        q_value = 0
        for transition in self.env.getTransitionStatesandProbs(state, action):
            # The fourth field of the transition tuple is not used here.
            next_state, reward, prob, _ = transition
            q_value += prob * (reward + self.gamma * self.V[next_state])
        return q_value

    def reset_values(self):
        """Reset every state's value estimate to 0."""
        self.V = {state: 0 for state in self.states}

# Build the environment and attach an agent that uses the default
# discount factor and convergence threshold.
env = RecycleEnv()
agent = MonteCarloAgent(env=env)

In [2]:
# Estimate the state values from 2000 sampled episodes (default episode length).
agent.estimate_value_monte_carlo(num_episodes=2000)
print(agent.V)
# Zero the value table again so later estimates do not start from these numbers.
agent.reset_values()

{'low': 14.858268593014833, 'high': 18.06617114887673}


In [3]:
# Start from a zeroed value table so this cell prints the same result every
# time it is run, regardless of what was executed before it.
agent.reset_values()
agent.policy_evaluation()
print(agent.V)

{'low': 14.788357138388012, 'high': 17.91216501012827}
