## Imports and variable initialization

In [None]:
import gymnasium as gym
import minari # needed for dataset
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch
import numpy as np
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Problem dimensions: s_dim is the policy network's input size and a_dim its
# output size (observation and action vector lengths, respectively).
s_dim, a_dim = 10, 2

# Training hyperparameters
batch_size = 256  # samples per mini-batch
lr = 1e-3         # Adam learning rate
epochs = 50       # full passes over the training loader

## Downloading and Pre-processing the Dataset

In [None]:
# Load the offline dataset and seed its episode sampling for reproducibility.
dataset = minari.load_dataset('mujoco/reacher/medium-v0')
dataset.set_seed(42)
print("Dataset loaded successfully!")

# Optional: sample a handful of episodes and inspect their ids.
#episodes = dataset.sample_episodes(6)
#ids = [episode.id for episode in episodes]
#print(ids)

# Optional: keep only the episodes whose mean reward is greater than -0.1.
#expert_dataset = dataset.filter_episodes(lambda episode: episode.rewards.mean() > -0.1)
#print(f'TOTAL EPISODES FILTER DATASET: {expert_dataset.total_episodes}')

# Split by absolute episode counts: 7000 training / 2000 evaluation / 1000 test
# (a 70% / 20% / 10% split, given that the dataset is expected to hold 10000
# episodes). The fixed seed keeps the split reproducible across runs.
dataset_split = minari.split_dataset(dataset, sizes=[7000, 2000, 1000], seed=42)
training_dataset, evaluation_dataset, test_dataset = dataset_split

print(f"Training episodes: {len(training_dataset)}")
print(f"Evaluation episodes: {len(evaluation_dataset)}")
print(f"Test episodes: {len(test_dataset)}")

def to_loader(data, batch_size, shuffle=True):
    """Flatten episodes into (observation, action) pairs wrapped in a DataLoader.

    Every episode must expose ``observations`` and ``actions`` arrays. The last
    observation of each episode is dropped so that the remaining observations
    line up one-to-one with the actions taken from them.

    Args:
        data: Iterable of episodes (objects with ``observations`` and
            ``actions`` array attributes).
        batch_size: Number of samples per mini-batch.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to True, which matches the previous behavior; pass False
            for deterministic batches.

    Returns:
        A DataLoader yielding ``(observations, actions)`` float32 tensor batches.

    Raises:
        ValueError: If ``data`` contains no episodes, or if an episode's
            trimmed observations and its actions differ in length.
    """
    x_list = []  # observations (input for the model)
    y_list = []  # actions (target)
    for ep in data:
        obs = ep.observations[:-1]  # drop the final observation: no action follows it
        acts = ep.actions

        # Consistency check. Raised explicitly rather than asserted so that it
        # still runs when Python is started with -O.
        if obs.shape[0] != acts.shape[0]:
            raise ValueError(f"Shape mismatch: {obs.shape[0]} vs {acts.shape[0]}")

        x_list.append(obs)
        y_list.append(acts)

    if not x_list:
        raise ValueError("Cannot build a DataLoader from an empty collection of episodes")

    # Stack all episodes along the sample axis.
    s = np.concatenate(x_list, axis=0)
    a = np.concatenate(y_list, axis=0)

    # Named tensor_dataset so it does not shadow the module-level `dataset`.
    tensor_dataset = TensorDataset(
        torch.tensor(s, dtype=torch.float32),
        torch.tensor(a, dtype=torch.float32),
    )
    return DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle)

# One loader per split, listed in the same order as the split itself.
train_loader = to_loader(data=training_dataset, batch_size=batch_size)
eval_loader = to_loader(data=evaluation_dataset, batch_size=batch_size)
test_loader = to_loader(data=test_dataset, batch_size=batch_size)

## Model

In [None]:
class ExpertPolicyNet(nn.Module):
    """Three-layer MLP mapping an input vector to an output vector.

    Architecture: ``input_dim -> 256 -> 128 -> output_dim`` with ReLU after the
    two hidden layers and no activation on the output layer.
    """

    def __init__(self, input_dim, output_dim):
        """Create the three linear layers.

        Args:
            input_dim: Size of each input vector.
            output_dim: Size of each output vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=output_dim)

    def forward(self, x):
        """Return the network output for a batch of inputs ``x``."""
        hidden1 = nn.functional.relu(self.fc1(x))
        hidden2 = nn.functional.relu(self.fc2(hidden1))
        # The output layer is linear: no activation is applied.
        return self.fc3(hidden2)

## Initialization of the training loop

In [None]:
# Policy network, placed on the selected device.
model = ExpertPolicyNet(input_dim=s_dim, output_dim=a_dim).to(device)

# Adam on every model parameter; mean-squared error between predicted and
# recorded actions is the regression objective.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.MSELoss()

# Per-epoch loss histories, filled in by the training loop and plotted afterwards.
train_losses, test_losses = [], []

## Training loop

In [None]:
for epoch in range(epochs):
    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    batch_losses = []
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Epoch loss is the mean of the per-batch mean losses.
    epoch_loss = sum(batch_losses)
    train_losses.append(epoch_loss / len(train_loader))

    # ---- Monitoring pass on test_loader (no gradients, eval mode) ----
    model.eval()
    test_total = 0
    with torch.no_grad():
        for x_test, y_test in test_loader:
            test_total += loss_fn(model(x_test.to(device)), y_test.to(device)).item()
    test_loss = test_total / len(test_loader)
    test_losses.append(test_loss)

    print(f"[Epoch {epoch+1}] Train Loss: {train_losses[-1]:.4f} | Test Loss: {test_loss:.4f}")

## EVAL SET LOSS

In [None]:
# Final loss on the evaluation split: mean of the per-batch MSE values,
# computed in eval mode with gradient tracking disabled.
model.eval()
eval_total = 0
with torch.no_grad():
    for x_eval, y_eval in eval_loader:
        eval_total += loss_fn(model(x_eval.to(device)), y_eval.to(device)).item()
eval_loss = eval_total / len(eval_loader)
print(f"\nFinal EVAL Loss: {eval_loss:.4f}")

## Plot of the Losses

In [None]:
# Draw both per-epoch loss curves on the same axes.
for curve, curve_label in ((train_losses, "Train Loss"), (test_losses, "Test Loss")):
    plt.plot(curve, label=curve_label)

plt.title("π* Training Progress")
plt.xlabel("Epoch")
plt.ylabel("Loss (MSE)")
plt.legend()
plt.show()