<a href="https://colab.research.google.com/github/gaurav-jo1/Reinforcement-Learning/blob/main/CartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import matplotlib.pyplot as plt

### Key components:

* `Agent:` The decision-maker (e.g., a robot, game player).
* `Environment:` The world the agent interacts with (e.g., a game, a simulation).
* `State (s):` The current situation of the environment.
* `Action (a):` What the agent can do.
* `Reward (r):` Feedback from the environment after an action.
* `Policy (π):` The strategy mapping states to actions.
* `Value Function:` Estimates the expected cumulative (discounted) future reward obtainable from a state, or from taking a given action in a state.

In [None]:
# --- Hyperparameters -------------------------------------------------------
# Return / target computation
GAMMA = 0.99            # discount factor applied to the bootstrapped next-state value

# Epsilon-greedy exploration schedule
EPSILON = 1.0           # initial exploration rate (probability of a random action)
EPSILON_MIN = 0.01      # floor that the exploration rate never decays below
EPSILON_DECAY = 0.995   # multiplicative decay applied by DQNAgent.update_epsilon

# Optimisation
LEARNING_RATE = 1e-3    # step size handed to the Adam optimizer
BATCH_SIZE = 32         # number of transitions sampled per training step

# Replay memory and run length
MEMORY_SIZE = 10_000    # maximum number of transitions kept in the replay memory
EPISODES = 500          # number of episodes run by the training loop

In [None]:
class DQN(nn.Module):
  """Small fully connected Q-network.

  Maps a state vector of length ``state_size`` to one output per action
  (``action_size`` values) through two hidden layers of 64 units with ReLU
  activations. The final layer is linear (no activation).
  """

  def __init__(self, state_size, action_size):
    """Create the three linear layers.

    Args:
      state_size: Number of input features.
      action_size: Number of outputs.
    """
    super().__init__()
    hidden_units = 64
    # Layers are created in input-to-output order so parameter registration
    # (and therefore state_dict ordering) is fc1, fc2, fc3.
    self.fc1 = nn.Linear(state_size, hidden_units)
    self.fc2 = nn.Linear(hidden_units, hidden_units)
    self.fc3 = nn.Linear(hidden_units, action_size)

  def forward(self, x):
    """Return the network output for input ``x``.

    Args:
      x: Tensor whose last dimension is ``state_size``.

    Returns:
      Tensor with the same leading dimensions and a last dimension of
      ``action_size``.
    """
    hidden = x
    # Both hidden layers are followed by a ReLU; the output layer is not.
    for layer in (self.fc1, self.fc2):
      hidden = torch.relu(layer(hidden))
    return self.fc3(hidden)

In [None]:
class ReplayMemory:
  """Fixed-capacity store of transitions with uniform random sampling.

  Backed by a ``deque`` with ``maxlen``, so once ``capacity`` items are held
  each new push silently discards the oldest one.
  """

  def __init__(self, capacity):
    """Create an empty memory that holds at most ``capacity`` items."""
    self.memory = deque([], maxlen=capacity)

  def __len__(self):
    """Return the number of items currently stored."""
    stored = self.memory
    return len(stored)

  def push(self, transition):
    """Append ``transition`` to the memory (evicting the oldest if full)."""
    buffer = self.memory
    buffer.append(transition)

  def sample(self, batch_size):
    """Return a list of ``batch_size`` distinct stored items chosen at random.

    Raises:
      ValueError: If ``batch_size`` exceeds the number of stored items
        (raised by ``random.sample``).
    """
    drawn = random.sample(self.memory, batch_size)
    return drawn

In [None]:
class DQNAgent():
  """Epsilon-greedy DQN agent with a uniform replay memory.

  The agent owns a single Q-network (``self.model``); the same network is
  used both to pick actions and to compute bootstrap targets (there is no
  separate target network).
  """

  def __init__(self, state_size, action_size):
    """Build the replay memory, Q-network and optimizer.

    Args:
      state_size: Length of the state vector fed to the network.
      action_size: Number of discrete actions to choose from.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayMemory(MEMORY_SIZE)
    self.model = DQN(state_size, action_size)
    self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
    self.epsilon = EPSILON

  def select_action(self, state):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    With probability ``self.epsilon`` a uniformly random action index is
    returned; otherwise the index of the largest network output.

    Args:
      state: Array-like state vector of length ``state_size``.

    Returns:
      An ``int`` action index in ``range(action_size)``.
    """
    if random.random() < self.epsilon:
      return random.randrange(self.action_size)
    # Inference only: no graph is needed to pick the greedy action.
    with torch.no_grad():
      state = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)
      q_values = self.model(state)
      return q_values.argmax().item()

  def train(self):
    """Run one gradient step on a random minibatch from the replay memory.

    Does nothing until at least ``BATCH_SIZE`` transitions are stored. Each
    stored transition is expected to be a
    ``(state, action, reward, next_state, done)`` tuple where ``done`` is
    ``1.0`` for transitions that must not bootstrap and ``0.0`` otherwise.
    """
    if len(self.memory) < BATCH_SIZE:
      return

    transitions = self.memory.sample(BATCH_SIZE)
    # Transpose the list of transitions into per-field tuples.
    states, actions, rewards, next_states, dones = zip(*transitions)

    states = torch.as_tensor(np.array(states), dtype=torch.float32)
    actions = torch.as_tensor(np.array(actions), dtype=torch.int64)
    rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32)
    next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
    dones = torch.as_tensor(np.array(dones), dtype=torch.float32)

    # Q(s, a) for the action that was actually taken in each transition.
    q_values = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The target is treated as a constant, so it is computed without grad.
    with torch.no_grad():
      next_q_values = self.model(next_states).max(1)[0]
      # (1 - dones) removes the bootstrap term for finished transitions.
      targets = rewards + GAMMA * next_q_values * (1 - dones)

    loss = nn.MSELoss()(q_values, targets)

    # Apply the update; without these three calls the network never learns.
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

  def update_epsilon(self):
    """Decay the exploration rate by ``EPSILON_DECAY``, floored at ``EPSILON_MIN``."""
    self.epsilon = max(EPSILON_MIN, self.epsilon * EPSILON_DECAY)

# Build the environment and an agent sized to its observation/action spaces.
env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = int(env.action_space.n)
agent = DQNAgent(state_size, action_size)
scores = []  # total reward obtained in each episode

try:
  for episode in range(EPISODES):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
      action = agent.select_action(state)
      next_state, reward, terminated, truncated, _ = env.step(action)
      # The episode ends on either signal ...
      done = terminated or truncated
      total_reward += reward

      # Store transition
      # ... but only a true termination should stop bootstrapping: a
      # time-limit truncation still has a meaningful next-state value.
      agent.memory.push((state, action, reward, next_state, float(terminated)))
      agent.train()
      state = next_state

    # Per-episode bookkeeping (runs once the inner loop has finished).
    agent.update_epsilon()
    scores.append(total_reward)
    if episode % 10 == 0:
      print(f"Episode {episode}, Avg Reward: {np.mean(scores[-10:])}")
finally:
  # Release the environment even if training is interrupted.
  env.close()

plt.plot(scores)
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.show()