In [3]:
!pip install gymnasium



In [3]:
!pip install numpy==1.23

Collecting numpy==1.23
  Downloading numpy-1.23.0.tar.gz (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: numpy
  Building wheel for numpy (pyproject.toml) ... [?25l[?25hdone
  Created wheel for numpy: filename=numpy-1.23.0-cp311-cp311-linux_x86_64.whl size=19729869 sha256=62dd2716d98eb8481f41833fe762567ba4dc696da5c05d1f567aed3df16d90cb
  Stored in directory: /root/.cache/pip/wheels/6d/36/1a/3ec6b85008bea3151efb003f5d41baa7bf4966cb43c1c2470b
Successfully built numpy
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does 

In [5]:
import gymnasium
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# Seed every source of randomness used in this notebook so a fresh
# "Restart & Run All" reproduces the same run.
SEED = 42
random.seed(SEED)        # drives the epsilon-greedy coin flip and replay sampling
torch.manual_seed(SEED)  # drives the network's weight initialisation

# Set up environment and device
env = gymnasium.make("MountainCar-v0")
env.action_space.seed(SEED)  # random exploration draws from env.action_space.sample()
env.reset(seed=SEED)         # seed the environment's RNG once, right after creation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Neural Network (Q-network)
class Brain(nn.Module):
    """Fully connected Q-network with a single hidden ReLU layer.

    Args:
        state_dim: Number of input features per state (default 2).
        hidden_dim: Width of the hidden layer (default 16).
        n_actions: Number of outputs, one value per action (default 3).

    The defaults reproduce the original fixed 2 -> 16 -> 3 layout, so
    ``Brain()`` builds the same architecture with the same parameter names.
    """

    def __init__(self, state_dim=2, hidden_dim=16, n_actions=3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
        )

    def forward(self, x):
        """Return the network output for ``x`` (last dimension must be ``state_dim``)."""
        return self.model(x)

# Hyperparameters
gamma = 0.99                  # discount applied to the bootstrapped next-state value
epsilon = 1.0                 # initial probability of taking a random action
epsilon_min = 0.05            # lower bound that epsilon is clamped to
epsilon_decay = 0.98          # factor epsilon is multiplied by after every episode
learning_rate = 0.001         # Adam step size (kept with the other tunables instead of inline)
memory = deque(maxlen=10000)  # replay buffer; the oldest entries are dropped once it is full
batch_size = 64               # number of transitions sampled per gradient step

# Initialize model, optimizer, loss
brain = Brain().to(device)
optimizer = optim.Adam(brain.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

# Choose action using ε-greedy policy
def choose_action(state):
    """Select an action for ``state`` using an epsilon-greedy rule.

    With probability ``epsilon`` a random action is drawn from the
    environment's action space; otherwise the index of the largest
    Q-network output for ``state`` is returned as a Python int.
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()

    # Add a leading batch dimension before feeding the network.
    observation = torch.FloatTensor(state).unsqueeze(0).to(device)
    with torch.no_grad():
        q_row = brain(observation)
    return q_row.argmax().item()

# Training function
def train():
    """Run one Q-learning gradient step on a random minibatch from ``memory``.

    Does nothing until ``memory`` holds at least 1000 transitions. Otherwise
    it samples ``batch_size`` transitions, builds the one-step target
    ``reward + gamma * max_a Q(next_state, a)`` (the bootstrap term is dropped
    wherever the stored done flag is true) and takes a single optimizer step
    on the loss between the predicted and target values.

    Reads the module-level ``memory``, ``batch_size``, ``brain``, ``device``,
    ``gamma``, ``loss_fn`` and ``optimizer``. Returns ``None``.
    """
    if len(memory) < 1000:
        return

    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Stack each field into one ndarray before handing it to torch; building a
    # tensor directly from a tuple of per-step arrays is a very slow path.
    states = torch.as_tensor(np.asarray(states, dtype=np.float32), device=device)
    actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=device).unsqueeze(1)
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=device)
    next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=device)
    dones = torch.as_tensor(np.asarray(dones, dtype=np.bool_), device=device)

    # squeeze(1) removes only the gather column, so the batch dimension
    # survives even when batch_size == 1.
    q_values = brain(states).gather(1, actions).squeeze(1)

    # Targets are constants for the loss: compute them without autograd.
    with torch.no_grad():
        next_q_values = brain(next_states).max(1)[0]
        targets = rewards + gamma * next_q_values * (~dones)

    loss = loss_fn(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Training loop
try:
    for episode in range(1000):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # The episode ends on either signal, but only `terminated` marks a
            # genuinely terminal state.
            done = terminated or truncated
            # Optional reward shaping (can improve learning)
            reward += abs(next_state[0] + 0.5)

            # Store `terminated`, not `done`: a time-limit truncation is not a
            # terminal state, so the value target should still bootstrap from
            # next_state in that case.
            memory.append((state, action, reward, next_state, terminated))
            state = next_state
            total_reward += reward

            train()

        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print(f"Episode {episode + 1}: Total Reward = {total_reward:.2f}, Epsilon = {epsilon:.3f}")
finally:
    # Release the environment even if the run is interrupted or raises.
    env.close()


Episode 1: Total Reward = -191.50, Epsilon = 0.980
Episode 2: Total Reward = -191.81, Epsilon = 0.960
Episode 3: Total Reward = -180.08, Epsilon = 0.941
Episode 4: Total Reward = -185.51, Epsilon = 0.922
Episode 5: Total Reward = -173.95, Epsilon = 0.904
Episode 6: Total Reward = -190.13, Epsilon = 0.886
Episode 7: Total Reward = -193.83, Epsilon = 0.868
Episode 8: Total Reward = -179.86, Epsilon = 0.851
Episode 9: Total Reward = -179.92, Epsilon = 0.834
Episode 10: Total Reward = -192.83, Epsilon = 0.817
Episode 11: Total Reward = -182.05, Epsilon = 0.801
Episode 12: Total Reward = -168.48, Epsilon = 0.785
Episode 13: Total Reward = -177.52, Epsilon = 0.769
Episode 14: Total Reward = -190.79, Epsilon = 0.754
Episode 15: Total Reward = -184.68, Epsilon = 0.739
Episode 16: Total Reward = -189.49, Epsilon = 0.724
Episode 17: Total Reward = -189.40, Epsilon = 0.709
Episode 18: Total Reward = -186.51, Epsilon = 0.695
Episode 19: Total Reward = -190.40, Epsilon = 0.681
Episode 20: Total Rew

KeyboardInterrupt: 