In [15]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple
import gymnasium as gym

This notebook gives a short, basic rundown of what you need to do for reinforcement learning. There are, of course, plenty of resources online as well.

Each step should help you understand the concept as well as learn how to translate it into code.

1) Environment: the system with which the agent will interact (we'll use the lunar lander)
2) Agent: the entity that makes decisions and takes actions within the environment
3) State: a representation of the current situation of the environment (e.g. the velocity and position of our lunar lander)
4) Action: a decision made by the agent (e.g. angle and thrust)
5) Reward: feedback from the environment based on the agent's actions (e.g. points for successfully landing)
6) Policy: the strategy the agent uses to decide which action to take in a given state



So, to build one:
1) Define your neural network: the network will approximate the policy or the value function
2) Create an experience replay memory: store experiences (state, action, reward, next state) and sample from them to train the network
3) Implement the training loop: interact with the environment, store experiences, sample from memory, and keep updating the network

In [9]:
import gymnasium as gym

# Smoke-test the environment: drive it for 1000 steps with uniformly random
# actions, rendering to a window, and start a new episode whenever one ends.
env = gym.make("CarRacing-v2", render_mode="human")
try:
    observation, info = env.reset(seed=42)
    for _ in range(1000):
        action = env.action_space.sample()  # this is where you would insert your policy
        observation, reward, terminated, truncated, info = env.step(action)

        # Either flag means the current episode is over, so begin a fresh one.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Release the environment (and its render window) even if a step raises
    # or the cell is interrupted part-way through the loop.
    env.close()

In [11]:
class ExperienceReplay:
    """Fixed-capacity replay memory with uniform random mini-batch sampling.

    Each stored experience must be an indexable record laid out as
    ``(state, action, reward, next_state, done)``; ``sample_batch`` reads
    positions 0 through 4 of every sampled record.

    Once ``capacity`` experiences are held, each new one overwrites the oldest
    in place. The memory therefore always contains the most recent
    ``capacity`` experiences, but after wrap-around their position inside
    ``buffer`` is no longer chronological.
    """

    def __init__(self, capacity):
        """Create an empty memory.

        Args:
            capacity: Maximum number of experiences to retain. With a value
                of zero or less nothing is ever retained.
        """
        # Sampled batches are moved to the GPU when one is available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        self.buffer = []
        # Slot that the next store() overwrites once the buffer is full.
        self._write_index = 0

    def store(self, experience):
        """Add one experience, evicting the oldest when the memory is full.

        Args:
            experience: Record ``(state, action, reward, next_state, done)``.
        """
        if self.capacity <= 0:
            return

        if len(self.buffer) < self.capacity:
            # Still filling up: the oldest entry stays at slot 0.
            self.buffer.append(experience)
            self._write_index = 0
            return

        # Full: overwrite in place rather than append + pop(0), which would
        # shift every remaining element on each call. Keeping a plain list
        # also keeps indexing O(1) for random.sample().
        self.buffer[self._write_index] = experience
        self._write_index = (self._write_index + 1) % len(self.buffer)

    def sample_batch(self, batch_size):
        """Draw ``batch_size`` distinct experiences uniformly at random.

        Args:
            batch_size: Number of experiences to draw.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors on ``self.device``. ``actions`` is int64; the others are
            float32. Each field is stacked row-wise with ``np.vstack``, so
            scalar fields come back with shape ``(n, 1)``.

        Raises:
            ValueError: If ``batch_size`` is negative or larger than the
                number of stored experiences (raised by ``random.sample``).
        """
        # None entries are skipped, matching the record layout check-free path.
        batch = [exp for exp in random.sample(self.buffer, k=batch_size) if exp is not None]

        def stack_field(index):
            # NOTE(review): np.vstack concatenates along the first axis, so
            # states with more than one dimension (e.g. images) would lose
            # their batch axis here - confirm states are 1-D vectors.
            return np.vstack([exp[index] for exp in batch])

        states = torch.from_numpy(stack_field(0)).float().to(self.device)
        # NOTE(review): .long() truncates non-integer values - this assumes
        # discrete action indices; confirm against the environment used.
        actions = torch.from_numpy(stack_field(1)).long().to(self.device)
        rewards = torch.from_numpy(stack_field(2)).float().to(self.device)
        next_states = torch.from_numpy(stack_field(3)).float().to(self.device)
        # Done flags go through uint8 so they end up as 0.0 / 1.0 floats.
        dones = torch.from_numpy(stack_field(4).astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones