In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Define the Actor and Critic networks
class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

    The network is a three-layer perceptron: two ReLU hidden layers of width
    ``hidden_dim`` followed by a linear layer of width ``action_dim`` whose
    output is squashed by ``tanh``, so every action component lies in
    [-1, 1].

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of components in an action vector.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int) -> None:
        super().__init__()
        # Layers are registered in input-to-output order.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Return the tanh-squashed action for ``state``.

        Args:
            state: Tensor whose last axis has size ``state_dim``.

        Returns:
            Tensor with the same leading axes as ``state`` and a last axis of
            size ``action_dim``, with values in [-1, 1].
        """
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden).tanh()

class Critic(nn.Module):
    """Action-value network estimating Q(state, action).

    The state and action are concatenated along their last axis and passed
    through two ReLU hidden layers of width ``hidden_dim`` and a linear
    output layer that produces a single scalar per input pair.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of components in an action vector.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        """Return the estimated value of taking ``action`` in ``state``.

        Args:
            state: Tensor whose last axis has size ``state_dim``.
            action: Tensor with the same leading axes as ``state`` and a last
                axis of size ``action_dim``.

        Returns:
            Tensor with the same leading axes as the inputs and a last axis
            of size 1.
        """
        # Concatenate along the feature (last) axis rather than axis 1 so that
        # unbatched 1-D inputs work as well as batched 2-D inputs.
        x = torch.cat([state, action], dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Define the Actor-Critic agent
class ActorCriticAgent:
    """Deterministic actor-critic agent trained with one-step TD updates.

    The critic is regressed towards ``reward + gamma * Q(next_state,
    actor(next_state))`` and the actor is updated to maximise the critic's
    value of its own action.

    Args:
        state_dim: Number of features in a state vector.
        action_dim: Number of components in an action vector.
        hidden_dim: Hidden-layer width used by both networks.
        lr_actor: Adam learning rate for the actor.
        lr_critic: Adam learning rate for the critic.
        gamma: Discount factor applied to the bootstrapped next value.
    """

    def __init__(self, state_dim, action_dim, hidden_dim, lr_actor, lr_critic, gamma):
        self.actor = Actor(state_dim, action_dim, hidden_dim)
        self.critic = Critic(state_dim, action_dim, hidden_dim)
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        self.gamma = gamma

    @staticmethod
    def _as_batch(values):
        """Return ``values`` as a float32 tensor of shape (batch, features).

        Scalars and 1-D inputs are treated as a single sample and become a
        batch of size one; inputs that already have two or more axes are
        returned with their shape unchanged.
        """
        tensor = torch.as_tensor(values, dtype=torch.float32)
        if tensor.dim() < 2:
            tensor = tensor.reshape(1, -1)
        return tensor

    def choose_action(self, state):
        """Return the actor's action for ``state`` as a NumPy array.

        Args:
            state: Array-like state whose last axis has size ``state_dim``.

        Returns:
            NumPy array holding the actor's output for ``state``; each
            component lies in [-1, 1] because the actor ends in ``tanh``.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        # Action selection never needs gradients.
        with torch.no_grad():
            action = self.actor(state)
        return action.numpy()

    def update(self, state, action, reward, next_state, done=False):
        """Run one critic step and one actor step on the given transition(s).

        A 1-D ``state``/``action``/``next_state`` is treated as a single
        transition. Batched inputs must be 2-D with one row per transition,
        with ``reward`` (and ``done``) holding one value per row.

        Args:
            state: State(s) in which the action was taken.
            action: Action(s) as produced by the actor (values in [-1, 1]).
            reward: Reward(s) received for the transition.
            next_state: State(s) reached after the action.
            done: Whether ``next_state`` is terminal. When true the target is
                the reward alone, with no bootstrapped next value. Defaults
                to ``False``, which always bootstraps.
        """
        # Give every input an explicit batch axis so that the critic's
        # concatenation and the target arithmetic line up row by row instead
        # of failing on 1-D tensors or broadcasting (N,) against (N, 1).
        state = self._as_batch(state)
        action = self._as_batch(action)
        next_state = self._as_batch(next_state)
        reward = torch.as_tensor(reward, dtype=torch.float32).reshape(-1, 1)
        not_done = 1.0 - torch.as_tensor(done, dtype=torch.float32).reshape(-1, 1)

        # Update the critic network. The TD target is a constant with respect
        # to the parameters, so it is computed without building a graph.
        with torch.no_grad():
            next_value = self.critic(next_state, self.actor(next_state))
            target = reward + self.gamma * not_done * next_value
        value = self.critic(state, action)
        critic_loss = F.mse_loss(value, target)
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()

        # Update the actor network by ascending the critic's value of the
        # actor's own action. This backward pass also writes gradients into
        # the critic, but they are cleared by ``zero_grad`` before the next
        # critic step and are never applied.
        actor_loss = -self.critic(state, self.actor(state)).mean()
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()

# Define the environment and agent parameters
env = gym.make('Pendulum-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
hidden_dim = 128
lr_actor = 0.0003
lr_critic = 0.001
gamma = 0.99

# The actor ends in tanh, so its output lies in [-1, 1]; these bounds are used
# to map that output onto the range of actions the environment accepts.
action_low = env.action_space.low
action_high = env.action_space.high

# Create the agent
agent = ActorCriticAgent(state_dim, action_dim, hidden_dim, lr_actor, lr_critic, gamma)

# Train the agent
num_episodes = 1000
max_steps = 200
for i_episode in range(num_episodes):
    state = env.reset()
    episode_reward = 0

    # The step loop must be nested inside the episode loop so that every
    # episode is actually played and learned from.
    for t in range(max_steps):
        action = agent.choose_action(state)
        # Affine map from the actor's [-1, 1] output to [low, high].
        env_action = action_low + 0.5 * (action + 1.0) * (action_high - action_low)
        next_state, reward, done, _ = env.step(env_action)
        # Learn from the actor's own (unscaled) action, since that is what the
        # critic sees when the actor is updated. Each array gets an explicit
        # leading batch axis of size one so the critic receives 2-D inputs.
        agent.update(
            np.asarray(state)[None],
            np.asarray(action)[None],
            reward,
            np.asarray(next_state)[None],
        )
        state = next_state
        episode_reward += reward
        if done:
            break
    print(f"Episode {i_episode}: reward = {episode_reward}")



In this code, we define the Actor and Critic networks as subclasses of PyTorch's `nn.Module`. The Actor network maps a state to an action using a fully connected neural network with two ReLU hidden layers and a hyperbolic tangent (`tanh`) output, which bounds each action component to [-1, 1]. The Critic network maps a state-action pair to a scalar value using a fully connected neural network with two ReLU hidden layers and a linear output.

We then define the Actor-Critic agent class. Its constructor builds the two networks and their Adam optimizers, and it provides methods for choosing an action from the current state (`choose_action`) and for updating the actor and critic networks from a single transition (`update`).

Finally, we create an instance of the agent and use it to train on the Pendulum-v0 environment from the OpenAI Gym using the `choose_action` and `update` methods. We run the training for 1000 episodes, each with a maximum of 200 steps, and print the total reward for each episode.

Note that this code is just an example, and there are many ways to modify and customize the Actor-Critic algorithm to suit different environments and tasks.
