# Project 7. 

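This project is an applied Python example of a reinforcement-learning agent: a policy network trained with the REINFORCE policy-gradient algorithm on Gym's CartPole-v1 environment. It is intended as a starting point for designing agents that must achieve objectives in the presence of noisy sensors and actuators.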


In [3]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import save_image

ModuleNotFoundError: No module named 'torch.utils.tensorboard'

In [1]:


# Define the Agent's Neural Network Model
class PolicyNetwork(nn.Module):
    """Small MLP that maps a state vector to a distribution over actions.

    The network is Linear(input_size, 128) -> ReLU -> Linear(128, output_size)
    followed by a softmax, so the values along the last axis of the output
    are non-negative and sum to 1.

    Args:
        input_size: Number of features in one state vector.
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, output_size),
            # Normalise over the last (action) axis rather than axis 1 so the
            # model accepts both batched (N, input_size) and unbatched
            # (input_size,) inputs; for 2-D input this is the same axis.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return action probabilities for ``x`` (same leading shape as ``x``)."""
        return self.fc(x)

# Define the Reinforcement Learning Agent
class Agent:
    """REINFORCE (Monte-Carlo policy-gradient) agent for a discrete-action env.

    The agent samples actions from ``PolicyNetwork``, stores the log-probability
    of every sampled action together with the reward received, and performs one
    gradient step per episode in ``update_policy``.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``reset()`` and a ``step(action)`` that returns
            the four values ``(next_state, reward, done, info)``.
    """

    def __init__(self, env):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.policy_network = PolicyNetwork(self.state_size, self.action_size)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=0.001)
        self.gamma = 0.99  # discount factor applied to future rewards
        # Per-episode buffers; both are emptied by update_policy().
        self.log_probs = []
        self.rewards = []

    def get_action(self, state):
        """Sample an action for ``state`` and remember its log-probability.

        Args:
            state: NumPy array holding a single observation.

        Returns:
            The sampled action as a plain Python int.
        """
        # Add a leading batch axis of size 1 before feeding the network.
        state = torch.from_numpy(state).float().unsqueeze(0)
        action_probs = self.policy_network(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        self.log_probs.append(action_dist.log_prob(action))
        return action.item()

    def update_policy(self):
        """Run one REINFORCE gradient step from the buffered episode.

        Computes the discounted return for every step, standardises the
        returns, minimises ``-sum(log_prob * return)`` and finally clears the
        ``log_probs`` and ``rewards`` buffers. When no rewards were collected
        the call only clears the buffers.
        """
        if not self.rewards:
            # Nothing to learn from; torch.cat() on an empty list would raise.
            self.log_probs = []
            return

        # Accumulate returns back-to-front with append() and flip once at the
        # end, instead of the quadratic insert(0, ...) pattern.
        returns = []
        running_return = 0.0
        for reward in reversed(self.rewards):
            running_return = reward + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32)
        # The sample standard deviation of a single value is NaN, which would
        # turn the loss -- and then every network weight -- into NaN. Only
        # standardise when the episode has at least two steps.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        policy_loss = [
            -log_prob * step_return
            for log_prob, step_return in zip(self.log_probs, returns)
        ]

        self.optimizer.zero_grad()
        torch.cat(policy_loss).sum().backward()
        self.optimizer.step()

        self.log_probs = []
        self.rewards = []

    def train(self, num_episodes):
        """Train for ``num_episodes`` episodes, logging each episode's reward.

        Episode rewards are written to a ``SummaryWriter`` under the tag
        "Episode Reward"; the writer is closed even if an episode fails.
        """
        writer = SummaryWriter()
        try:
            for episode in range(1, num_episodes + 1):
                # Use the agent's own environment, not a module-level global,
                # so the agent works with whatever env it was constructed with.
                state = self.env.reset()
                done = False
                episode_reward = 0

                while not done:
                    action = self.get_action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    self.rewards.append(reward)
                    episode_reward += reward
                    state = next_state

                self.update_policy()
                writer.add_scalar("Episode Reward", episode_reward, episode)
        finally:
            writer.close()

    def record_video(self, video_path):
        """Roll out one rendered episode through gym's Monitor wrapper.

        Args:
            video_path: Location handed to ``gym.wrappers.Monitor`` for its
                output.
        """
        # NOTE(review): gym.wrappers.Monitor is not available in newer gym
        # releases -- confirm the installed version provides it.
        env = gym.wrappers.Monitor(self.env, video_path, force=True)
        buffered = len(self.log_probs)
        try:
            state = env.reset()
            done = False
            # This rollout is for display only, so skip autograd bookkeeping.
            with torch.no_grad():
                while not done:
                    env.render()
                    action = self.get_action(state)
                    state, _, done, _ = env.step(action)
        finally:
            # get_action() buffered a log-prob per step; discard them so a
            # later update_policy() does not pair them with unrelated rewards.
            del self.log_probs[buffered:]
            env.close()

# Run the demo only when executed directly (script or notebook cell), so that
# importing this module does not start a training run as a side effect.
if __name__ == "__main__":
    # Create the environment
    env = gym.make('CartPole-v1')

    # Create the agent and train
    agent = Agent(env)
    agent.train(num_episodes=200)

    # Record a video of the agent's actions
    # NOTE(review): gym.wrappers.Monitor appears to treat this value as an
    # output directory rather than a file name -- confirm against the gym docs.
    video_path = "agent_video.mp4"
    agent.record_video(video_path)


ModuleNotFoundError: No module named 'torch'

Explanation:

The code defines a PolicyNetwork class, which represents the agent's neural network model. It consists of fully connected layers and uses softmax activation for the output layer.

The Agent class encapsulates the reinforcement learning agent. It initializes the agent with the environment, sets up the neural network, optimizer, and other hyperparameters. It also defines methods for selecting actions, updating the policy, and training the agent using the REINFORCE algorithm.

The get_action method selects an action based on the current state using the policy network. It samples an action from the action probabilities and stores the logarithm of the selected action's probability.

The update_policy method calculates the discounted rewards and performs the policy update step using the REINFORCE algorithm. It computes the policy loss based on the log probabilities and rewards, performs backpropagation, and updates the policy network's parameters.

The train method trains the agent for a specified number of episodes. It interacts with the environment, collects rewards and log probabilities, and updates the policy network at the end of each episode. It also logs the episode rewards using TensorBoard.

The record_video method records one episode of the agent acting in the environment. It wraps the environment in Gym's Monitor wrapper, which writes the recording to the location given by video_path (Monitor treats this as an output directory). During the episode each frame is rendered and the agent selects its actions by sampling from the learned policy.

Finally, the code creates an instance of the CartPole environment, initializes the agent, and trains it for a specified number of episodes. After training, the agent's actions are recorded and saved as a video file.

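Note: To run this code, you will first need to install its dependencies: Gym, PyTorch, torchvision, and TensorBoard. The ModuleNotFoundError messages shown above mean these packages were missing when the cells were executed. The code is written against the classic Gym API (env.step returns four values and gym.wrappers.Monitor is available), so it needs a Gym release that still provides them.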

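This code provides a practical starting point for an agent that learns to act so as to achieve a given objective, and it can record a video of the agent's behaviour after training. Note that CartPole itself supplies exact observations and executes actions exactly, so to study noisy sensors and actuators you would add noise explicitly, for example with wrappers that perturb the observations the agent receives and the actions the environment executes.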