# Prototyping: GEM Control Definition

Our aim is to solve RL tasks in which rewards are delayed (aggregated and anonymous), using linear function approximation.
To do so, we project both large discrete state spaces and continuous state spaces onto basis vectors.


In this notebook, we analyze the structure of the control problem for GEM environments, to understand their value functions and the agent's goal.

In [None]:
import functools
import math
import random
from typing import Sequence, Mapping


In [None]:
import gymnasium as gym
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
from drmdp import envs, feats

## Control with SARSA

In [None]:
def action_values(
    observation, actions: Sequence[int], weights, feat_transform: feats.FeatTransform
):
    """Evaluate the linear action-value estimate of every action for one observation.

    The observation is paired with each entry of `actions`, the pairs are
    featurised in a single `feat_transform.batch_transform` call, and the
    resulting feature matrix is multiplied by `weights`.

    Returns:
        A tuple `(q_values, features)`: the dot product of the feature matrix
        with `weights`, and the feature matrix itself.
    """
    # One copy of the observation per candidate action.
    repeated_obs = [observation for _ in actions]
    features = feat_transform.batch_transform(repeated_obs, actions)
    q_values = np.dot(features, weights)
    return q_values, features

In [None]:
def _epsilon_greedy_action(env, qvalues, epsilon: float):
    """Pick an action epsilon-greedily w.r.t. `qvalues`, breaking ties uniformly at random."""
    if random.random() <= epsilon:
        return env.action_space.sample()
    return np.random.choice(np.flatnonzero(qvalues == qvalues.max()))


def semi_gradient_sarsa(
    env, alpha: float, gamma: float,
    epsilon: float, num_episodes: int,
    feat_transform: feats.FeatTransform,
    verbose: bool = True
):
    """Learn action-value weights with episodic semi-gradient SARSA.

    Action values are computed by `action_values` as a dot product between
    the features produced by `feat_transform` and the weight vector, so the
    feature row of the chosen action is used as the gradient in each update.

    Args:
        env: Environment exposing `reset`, `step` and an `action_space` with
            `n` and `sample()`.
        alpha: Step size of the weight update.
        gamma: Discount applied to the next state-action value.
        epsilon: Probability of sampling an action from `env.action_space`
            instead of acting greedily.
        num_episodes: Number of training episodes.
        feat_transform: Feature transform; must provide `output_shape` and
            whatever `action_values` calls on it.
        verbose: When true, print the running mean of the episode returns
            roughly five times over the run.

    Returns:
        The learned weight array, of shape `feat_transform.output_shape`.
    """
    actions = tuple(range(env.action_space.n))
    weights = np.zeros(feat_transform.output_shape, dtype=np.float64)
    returns = []
    # Report about five times per run. Clamp to 1 so that runs with fewer
    # than five episodes do not take a modulo by zero.
    report_every = max(1, num_episodes // 5)

    for i in range(num_episodes):
        obs, _ = env.reset()
        state_qvalues, gradients = action_values(obs, actions, weights, feat_transform)
        rewards = 0
        action = _epsilon_greedy_action(env, state_qvalues, epsilon)

        while True:
            next_obs, reward, term, trunc, _ = env.step(action)
            rewards += reward

            if term or trunc:
                # Episode over: the target is the reward alone (no bootstrap).
                # NOTE: truncation is handled exactly like termination here.
                weights = weights + alpha * (reward - state_qvalues[action]) * gradients[action]
                break

            next_state_qvalues, next_gradients = action_values(
                next_obs, actions, weights, feat_transform
            )
            next_action = _epsilon_greedy_action(env, next_state_qvalues, epsilon)

            # SARSA target bootstraps from the action that will actually be taken next.
            td_error = (
                reward + gamma * next_state_qvalues[next_action] - state_qvalues[action]
            )
            weights = weights + alpha * td_error * gradients[action]

            action = next_action
            state_qvalues = next_state_qvalues
            gradients = next_gradients
        returns.append(rewards)
        if verbose and (i + 1) % report_every == 0:
            print("Episode", i+1, "mean returns:", np.mean(returns))
    return weights

In [None]:
def play(env, weights, num_episodes: int, feat_transform):
    """Roll out the greedy policy induced by `weights` and return the episode returns.

    At every step the action with the highest estimated value is taken, with
    ties broken uniformly at random. An episode ends when the environment
    reports termination or truncation.

    Returns:
        A list with the summed reward of each of the `num_episodes` episodes.
    """
    action_ids = tuple(range(env.action_space.n))
    episode_returns = []
    for _ in range(num_episodes):
        obs, _ = env.reset()
        total_reward = 0
        finished = False
        while not finished:
            qvals, _ = action_values(obs, action_ids, weights, feat_transform)
            greedy_actions = np.flatnonzero(qvals == qvals.max())
            obs, reward, term, trunc, _ = env.step(np.random.choice(greedy_actions))
            total_reward += reward
            finished = term or trunc
        episode_returns.append(total_reward)
    return episode_returns

In [None]:
def collect_interaction_data(env, weights, num_episodes: int, feat_transform):
    """Roll out the greedy policy induced by `weights`, recording every transition.

    Actions are chosen greedily with respect to the estimated action values,
    with ties broken uniformly at random. An episode ends when the
    environment reports termination or truncation.

    Returns:
        A tuple `(buffer, returns)`. `buffer` holds one list per episode of
        `(obs, action, next_obs, reward)` tuples, and `returns` holds the
        summed reward of each episode.
    """
    action_ids = tuple(range(env.action_space.n))
    episodes = []
    episode_returns = []
    for _ in range(num_episodes):
        current_obs, _ = env.reset()
        transitions = []
        total_reward = 0
        finished = False
        while not finished:
            qvals, _ = action_values(current_obs, action_ids, weights, feat_transform)
            greedy_actions = np.flatnonzero(qvals == qvals.max())
            action = np.random.choice(greedy_actions)
            following_obs, reward, term, trunc, _ = env.step(action)
            total_reward += reward
            transitions.append((current_obs, action, following_obs, reward))
            current_obs = following_obs
            finished = term or trunc
        episode_returns.append(total_reward)
        episodes.append(transitions)
    return episodes, episode_returns

In [None]:
def control_and_evaluate(
    env: gym.Env,
    ft_tfx_args: Sequence[Mapping],
    alpha: float = 0.01,
    epsilon: float = 0.2,
    num_episodes: int = 5000,
    gamma: float = 1.0,
    turns: int = 5,
    eval_episodes: int = 15,
    eval_max_steps: int = 1000,
):
    """Train with semi-gradient SARSA and evaluate greedily, repeated over several turns.

    For every feature-transform spec in `ft_tfx_args`, `turns` independent
    runs are made. Each run builds a fresh feature transformer, learns
    weights with `semi_gradient_sarsa`, and then collects `eval_episodes`
    greedy episodes with `collect_interaction_data`.

    Note that `eval_max_steps` is only recorded in the experiment arguments;
    this function does not pass it to training or evaluation.

    Returns:
        A DataFrame with one row per (spec, turn) and the columns
        `ft_tfx_args`, `buffer`, `turn` and `exp_args`.
    """
    # A single dict shared by every row; `turns` itself is not part of it.
    exp_args = dict(
        alpha=alpha,
        epsilon=epsilon,
        num_episodes=num_episodes,
        gamma=gamma,
        eval_episodes=eval_episodes,
        eval_max_steps=eval_max_steps,
    )
    records = []
    for tfx_kwargs in ft_tfx_args:
        print("Control with Fn Approx - SARSA:", tfx_kwargs)
        for turn in range(turns):
            print("Turn", turn + 1)
            transformer = feats.create_feat_transformer(env, **tfx_kwargs)
            print("ft-tfx:", vars(transformer))
            learned_weights = semi_gradient_sarsa(
                env,
                alpha=alpha,
                gamma=gamma,
                epsilon=epsilon,
                num_episodes=num_episodes,
                feat_transform=transformer,
            )
            buffer, returns = collect_interaction_data(
                env,
                weights=learned_weights,
                num_episodes=eval_episodes,
                feat_transform=transformer,
            )
            print(f"Eval (mean) returns: [min:{np.min(returns)}, mean:{np.mean(returns)}, max:{np.max(returns)}]")

            record = {
                "ft_tfx_args": tfx_kwargs,
                "buffer": buffer,
                "turn": turn,
                "exp_args": exp_args,
            }
            records.append(record)
    return pd.DataFrame(records)

## Experiments

In [None]:
def plot_reward_trajectories(buffer):
    """Plot the per-step reward of every episode in `buffer` on one figure.

    `buffer` is a sequence of episodes, each a sequence of step tuples whose
    element at index 3 is the reward. One line is drawn per episode.
    """
    records = [
        {"step": step_idx, "reward": transition[3], "episode": episode_idx}
        for episode_idx, episode in enumerate(buffer)
        for step_idx, transition in enumerate(episode)
    ]
    df_plot = pd.DataFrame(records)
    _, ax = plt.subplots(figsize=(6, 6))
    sns.lineplot(df_plot, x="step", y="reward", hue="episode", ax=ax)

In [None]:
def plot_config_reward_trajectories(df_results, header_config_keys):
    """Plot reward-per-step trajectories, one subplot per experiment config.

    Args:
        df_results: DataFrame with the columns `config_id`, `buffer`, `turn`
            and `exp_args`. Each `buffer` is a sequence of episodes, each a
            sequence of step tuples whose element at index 3 is the reward.
        header_config_keys: Keys of `exp_args` to show in each subplot title.
    """
    configs = np.unique(df_results["config_id"])
    num_configs = len(configs)
    # squeeze=False always yields a 2D array of Axes; without it a single
    # config returns a bare Axes object and `.flatten()` fails.
    _, axes = plt.subplots(
        ncols=num_configs,
        figsize=(6*num_configs, 6),
        sharex=True,
        sharey=True,
        squeeze=False,
    )

    for config_id, ax in zip(configs, axes.flatten()):
        df_slice = df_results[df_results["config_id"] == config_id]
        # Pool the steps of every episode of every turn run under this config.
        rows = [
            {"step": idx, "reward": step[3], "episode": ep, "turn": row["turn"]}
            for row in df_slice.to_dict("records")
            for ep, steps in enumerate(row["buffer"])
            for idx, step in enumerate(steps)
        ]
        df_plot = pd.DataFrame(rows)
        # Take the title from the first row's experiment arguments.
        exp_args = df_slice.iloc[0]["exp_args"]
        headers = {key: value for key, value in exp_args.items() if key in header_config_keys}
        sns.lineplot(df_plot, x="step", y="reward", hue="turn", ax=ax)
        ax.set_title(f"Config: {headers}")

In [None]:
def gem_experiments(env, ft_tfx_args, configs):
    """Run `control_and_evaluate` once per config and stack the results.

    Each entry of `configs` is passed as keyword arguments to
    `control_and_evaluate`. The resulting frames are tagged with a
    `config_id` column (the config's position in `configs`) and concatenated.
    """
    frames = [
        control_and_evaluate(env, ft_tfx_args=ft_tfx_args, **overrides).assign(
            config_id=config_id
        )
        for config_id, overrides in enumerate(configs)
    ]
    return pd.concat(frames)

In [None]:
# Create the "Finite-CC-PMSM-v0" environment through the project's envs.make factory,
# with episodes capped via max_episode_steps=5000, then echo it as the cell output.
# NOTE(review): the meaning of pos_enforcement, constraint_violation_reward and
# penalty_gamma is defined in drmdp.envs — confirm there.
env = envs.make(
    "Finite-CC-PMSM-v0", pos_enforcement=False, constraint_violation_reward=None, 
    penalty_gamma=0.9, max_episode_steps=5000
)
env

In [None]:
# Reset once to obtain an initial observation to inspect; the info dict is discarded.
obs, _ = env.reset()

In [None]:
obs

In [None]:
# Build the feature transformer named "scale" for this environment and echo it.
ft_tfx = feats.create_feat_transformer(env, **{"name": "scale"})
ft_tfx

In [None]:
# All-zero weights give every action the same estimated value, and
# collect_interaction_data breaks ties uniformly at random, so this rollout
# follows a uniformly random policy for 20 episodes.
weights = np.zeros(ft_tfx.output_shape)
buffer, returns = collect_interaction_data(
    env,
    weights=weights,
    num_episodes=20,
    feat_transform=ft_tfx,
)

In [None]:
# Per-step rewards of the first collected episode (index 3 of each step tuple is the reward).
sns.lineplot([exp[3] for exp in buffer[0]])

### No penalty - Varying learning steps

In [None]:
%%time
# Feature-transform specs to evaluate; only the "scale" transform here.
tf_tfx_args = [
    {"name": "scale"},
]
# Environment for this section, with penalty_gamma=1.0 and constraint_violation_reward=0.0.
# NOTE(review): presumably these values switch the constraint-violation penalty off,
# matching the section title — confirm in drmdp.envs.
env = envs.make(
    "Finite-CC-PMSM-v0", pos_enforcement=False, penalty_gamma=1.0, 
    constraint_violation_reward=0.0, max_episode_steps=5000
)
# Each config overrides control_and_evaluate defaults; only the number of
# training episodes varies, with 5 turns per config.
configs = [
    {"num_episodes": 1000, "turns": 5},
    {"num_episodes": 2000, "turns": 5},
    {"num_episodes": 4000, "turns": 5},
]
df_nopenalty_vsteps = gem_experiments(env, ft_tfx_args=tf_tfx_args, configs=configs)

In [None]:
df_nopenalty_vsteps

In [None]:
# One subplot per config, each titled with its num_episodes value.
plot_config_reward_trajectories(df_nopenalty_vsteps, header_config_keys=("num_episodes",))

### Discount based penalty, discounted and undiscounted learning

In [None]:
%%time
# Feature-transform specs to evaluate; only the "scale" transform here.
tf_tfx_args = [
    {"name": "scale"},
]
# Environment for this section, with penalty_gamma=0.9; constraint_violation_reward
# is left at whatever default envs.make applies.
# NOTE(review): presumably the penalty is then derived from penalty_gamma — confirm in drmdp.envs.
env = envs.make(
    "Finite-CC-PMSM-v0", pos_enforcement=False, penalty_gamma=0.9,
    max_episode_steps=5000
)
# Compare discounted (gamma=0.9) and undiscounted (gamma=1.0) learning.
# num_episodes is not overridden, so control_and_evaluate's default applies.
configs = [
    {"gamma": 0.9, "turns": 5},
    {"gamma": 1.0, "turns": 5},
]
df_discountpenalty_vgamma = gem_experiments(env, ft_tfx_args=tf_tfx_args, configs=configs)

In [None]:
df_discountpenalty_vgamma

In [None]:
# One subplot per config, each titled with its gamma value.
plot_config_reward_trajectories(df_discountpenalty_vgamma, header_config_keys=("gamma",))

### Fixed penalty, discounted and undiscounted learning

In [None]:
# Feature-transform specs to evaluate; only the "scale" transform here.
tf_tfx_args = [
    {"name": "scale"},
]
# Environment for this section, with constraint_violation_reward=-10;
# penalty_gamma is left at whatever default envs.make applies.
# NOTE(review): presumably -10 is the fixed reward given on a constraint violation — confirm in drmdp.envs.
env = envs.make(
    "Finite-CC-PMSM-v0", pos_enforcement=False,
    constraint_violation_reward=-10, max_episode_steps=5000
)
# Compare discounted (gamma=0.9) and undiscounted (gamma=1.0) learning,
# with only 100 training episodes per turn.
configs = [
    {"gamma": 0.9, "turns": 5, "num_episodes": 100},
    {"gamma": 1.0, "turns": 5, "num_episodes": 100},
]
df_fixedpenalty_vgamma = gem_experiments(env, ft_tfx_args=tf_tfx_args, configs=configs)

In [None]:
df_fixedpenalty_vgamma

In [None]:
# One subplot per config, each titled with its gamma value.
plot_config_reward_trajectories(df_fixedpenalty_vgamma, header_config_keys=("gamma",))

### Time limit and Longevity based penalty

In [None]:
# Tabulate, over a 1,000,000-step horizon, the discounted return of a constant
# per-step bonus c (returns_c2) next to the discounted return of always
# receiving the worst base reward (returns_c1_lb), for several c and gamma.
steps = np.arange(1_000_000)
# Reward = c1 + c2
# c1 is the base reward
# c2 is the penalty or incentive to keep going
# positive fraction of the reward range?
worst_reward = -1.0
c_options = [0.01, 0.1, 1, 10, 2*np.abs(worst_reward)]
gamma_options = [0.8, 0.9, 0.99, 0.999, 1.0]

# The discounted sums depend only on gamma, so compute them once per gamma
# rather than once per (c, gamma) pair.
horizon_sums = {}
for gamma in gamma_options:
    discounts = np.power(gamma, steps)
    horizon_sums[gamma] = (np.sum(discounts), np.sum(discounts * worst_reward))

rows = []
for c in c_options:
    for gamma in gamma_options:
        discount_sum, returns_c1_lb = horizon_sums[gamma]
        rows.append(
            {
                "c": c,
                "gamma": gamma,
                "returns_c2": c * discount_sum,
                "returns_c1_lb": returns_c1_lb,
            }
        )
pd.DataFrame(rows)

In [None]:
# Scratch check for gamma=0.9: the discount factor after 100 and 1000 steps,
# next to the geometric-series limit 1/(1-gamma).
# NOTE(review): with IPython's default display settings only the last expression
# of a cell is echoed, so the first tuple is presumably never shown — confirm.
0.9**100, 1/(1-0.9)
0.9**1000, 1/(1-0.9)