# Prototyping: Control with Linear Function Approximation

Our aim is to carry out RL tasks when rewards are delayed (aggregate and anonymous), using linear function approximation.
To solve this problem, we aim to project both large discrete states and continuous states into basis vectors.


In this notebook, we test policy control with linear function approximation, using a selected set of environments.
These environments have either discrete or continuous states, and discrete actions.

The environments and their best encoding for reward estimation from previous analyses are:

  - GridWorld: Tiling(8, HT=512)
  - RedGreen: Tiling(2)
  - MountainCar: Tiling(4)
  - GEM:Finite-CC-PMSM-v0 (Gym Electric Motor): Scaled raw features

We cannot know whether a feature encoding that is suitable for estimating the rewards from aggregate samples is equally adequate to represent state for learning an action-value function in control (or a state-value function in the case of policy evaluation).
So, for each, we also use a second encoding as a reference for comparison: 

  - GridWorld: Random Binary Vectors
  - RedGreen: Random Binary Vectors
  - MountainCar: GaussianMixture(covariance_type='diag', n_components=3)
  - GEM:Finite-CC-PMSM-v0 (Gym Electric Motor): GaussianMixture(covariance_type='diag', n_components=11)



In [None]:
import math
import random
from typing import Sequence


In [None]:
import gymnasium as gym
import numpy as np
import pandas as pd


In [None]:
from drmdp import envs, feats

## Control with SARSA

In [None]:
def action_values(
    observation, actions: Sequence[int], weights, feat_transform: feats.FeatTransform
):
    """Evaluate the linear action-value estimate of every action in one state.

    The observation is paired with each entry of ``actions`` and the pairs are
    encoded in a single ``feat_transform.batch_transform`` call.

    Args:
        observation: the state observation to evaluate.
        actions: the candidate actions.
        weights: the weight vector of the linear approximator.
        feat_transform: encoder exposing ``batch_transform(observations, actions)``.

    Returns:
        A ``(values, features)`` pair: ``values`` is ``np.dot(features, weights)``
        and ``features`` is the encoded state-action matrix (presumably one row
        per action -- confirm against ``batch_transform``).
    """
    # One copy of the observation per candidate action.
    repeated_obs = [observation for _ in actions]
    features = feat_transform.batch_transform(repeated_obs, actions)
    values = np.dot(features, weights)
    return values, features

In [None]:
def _epsilon_greedy(env, qvalues, epsilon: float):
    """Select an action epsilon-greedily with respect to ``qvalues``.

    With probability ``epsilon`` an action is sampled from ``env.action_space``;
    otherwise one of the maximising actions is picked uniformly at random, so
    ties are not always resolved in favour of the lowest index.
    """
    # random.random() lies in [0, 1): a strict `<` explores with probability
    # exactly epsilon, and never when epsilon == 0.
    if random.random() < epsilon:
        return env.action_space.sample()
    return np.random.choice(np.flatnonzero(qvalues == qvalues.max()))


def semi_gradient_sarsa(
    env,
    alpha: float,
    gamma: float,
    epsilon: float,
    num_episodes: int,
    feat_transform: feats.FeatTransform,
    verbose: bool = True,
):
    """Learn action-value weights with episodic semi-gradient SARSA.

    The action-value function is linear in the features produced by
    ``feat_transform``, so the gradient of q(s, a, w) with respect to ``w`` is
    the feature vector of ``(s, a)`` itself.

    Args:
        env: environment with a discrete action space (``env.action_space.n``)
            and the ``reset()`` / ``step()`` five-tuple interface.
        alpha: step size.
        gamma: discount factor.
        epsilon: exploration probability of the epsilon-greedy behaviour policy.
        num_episodes: number of training episodes.
        feat_transform: state-action feature encoder; ``output_shape`` sizes
            the weight vector.
        verbose: when true, print the running mean return roughly five times
            over the course of training.

    Returns:
        The learned weight array.
    """
    actions = tuple(range(env.action_space.n))
    weights = np.zeros(feat_transform.output_shape, dtype=np.float64)
    returns = []
    # At least 1, so that the modulo below cannot divide by zero when fewer
    # than five episodes are requested.
    report_every = max(1, num_episodes // 5)

    for i in range(num_episodes):
        obs, _ = env.reset()
        state_qvalues, gradients = action_values(obs, actions, weights, feat_transform)
        rewards = 0
        action = _epsilon_greedy(env, state_qvalues, epsilon)

        while True:
            next_obs, reward, term, trunc, _ = env.step(action)
            rewards += reward

            features = gradients[action]
            # Evaluate q(S, A, w) with the *current* weights: the value cached
            # when the action was selected predates the previous update.
            q_sa = np.dot(features, weights)

            # NOTE(review): truncation is handled like termination here (no
            # bootstrapping from the next state) -- confirm this is intended.
            if term or trunc:
                weights = weights + alpha * (reward - q_sa) * features
                break

            next_state_qvalues, next_gradients = action_values(
                next_obs, actions, weights, feat_transform
            )
            next_action = _epsilon_greedy(env, next_state_qvalues, epsilon)

            td_error = reward + gamma * next_state_qvalues[next_action] - q_sa
            weights = weights + alpha * td_error * features

            action = next_action
            gradients = next_gradients
        returns.append(rewards)
        if verbose and (i + 1) % report_every == 0:
            print("Episode", i + 1, "mean returns:", np.mean(returns))
    return weights

In [None]:
def collect_interaction_data(env, weights, num_episodes: int, feat_transform):
    """Run the greedy policy and record every transition it generates.

    Actions are greedy with respect to the linear action values under
    ``weights``; ties between maximising actions are broken uniformly at random.

    Args:
        env: environment with a discrete action space.
        weights: weights of the linear action-value function.
        num_episodes: number of episodes to roll out.
        feat_transform: state-action feature encoder passed to ``action_values``.

    Returns:
        ``(buffer, returns)``: ``buffer`` holds one list per episode of
        ``(obs, action, next_obs, reward)`` tuples, and ``returns`` holds the
        undiscounted reward sum of each episode.
    """
    action_ids = tuple(range(env.action_space.n))
    episodes = []
    episode_returns = []
    for _ in range(num_episodes):
        current_obs, _info = env.reset()
        total = 0
        transitions = []
        finished = False
        while not finished:
            qvalues, _features = action_values(
                current_obs, action_ids, weights, feat_transform
            )
            greedy_actions = np.flatnonzero(qvalues == qvalues.max())
            chosen = np.random.choice(greedy_actions)
            following_obs, step_reward, terminated, truncated, _info = env.step(chosen)
            total += step_reward
            transitions.append((current_obs, chosen, following_obs, step_reward))
            current_obs = following_obs
            finished = terminated or truncated
        episode_returns.append(total)
        episodes.append(transitions)
    return episodes, episode_returns

In [None]:
def play(env, weights, num_episodes: int, feat_transform):
    """Roll out the greedy policy and report each episode's total reward.

    Args:
        env: environment with a discrete action space.
        weights: weights of the linear action-value function.
        num_episodes: number of episodes to play.
        feat_transform: state-action feature encoder passed to ``action_values``.

    Returns:
        A list with the undiscounted reward sum of every episode.
    """
    action_ids = tuple(range(env.action_space.n))
    episode_returns = []
    for _ in range(num_episodes):
        current_obs, _info = env.reset()
        total = 0
        finished = False
        while not finished:
            qvalues, _features = action_values(
                current_obs, action_ids, weights, feat_transform
            )
            # Break ties between maximising actions uniformly at random.
            greedy_actions = np.flatnonzero(qvalues == qvalues.max())
            chosen = np.random.choice(greedy_actions)
            current_obs, step_reward, terminated, truncated, _info = env.step(chosen)
            total += step_reward
            finished = terminated or truncated
        episode_returns.append(total)
    return episode_returns

In [None]:
def control_and_evaluate(
    env: gym.Env,
    args,
    alpha: float = 0.001,
    epsilon: float = 0.1,
    num_episodes: int = 5000,
    gamma: float = 1.0,
    turns: int = 5,
    eval_max_steps: int = 1000,
):
    """Train SARSA with each feature encoding and collect greedy rollouts.

    For every encoder spec in ``args`` and every turn, a fresh feature
    transformer is created, weights are learned with ``semi_gradient_sarsa``,
    and 20 greedy evaluation episodes are recorded.

    Args:
        env: environment to train and evaluate on.
        args: iterable of keyword-argument dicts, each forwarded to
            ``feats.create_feat_transformer``.
        alpha: SARSA step size.
        epsilon: exploration probability during training.
        num_episodes: training episodes per turn.
        gamma: discount factor.
        turns: independent repetitions per encoder spec.
        eval_max_steps: accepted for API compatibility; not used by this
            function at present.

    Returns:
        A ``pandas.DataFrame`` with one row per (spec, turn) and the columns
        ``args``, ``buffer`` and ``turn``.
    """
    records = []
    for spec in args:
        print("Control with Fn Approx - SARSA:", spec)
        for turn in range(turns):
            print("Turn", turn + 1)
            # A new transformer per turn, so each repetition starts from scratch.
            transformer = feats.create_feat_transformer(env, **spec)
            print("ft-tfx:", vars(transformer))
            learned_weights = semi_gradient_sarsa(
                env,
                alpha=alpha,
                gamma=gamma,
                epsilon=epsilon,
                num_episodes=num_episodes,
                feat_transform=transformer,
            )
            episodes, eval_returns = collect_interaction_data(
                env,
                weights=learned_weights,
                num_episodes=20,
                feat_transform=transformer,
            )
            lowest = np.min(eval_returns)
            average = np.mean(eval_returns)
            highest = np.max(eval_returns)
            print(
                f"Eval (mean) returns: [min:{lowest}, mean:{average}, max:{highest}]"
            )
            records.append({"args": spec, "buffer": episodes, "turn": turn})
    return pd.DataFrame(records)

### Grid World

In [None]:
# GridWorld: train and evaluate with the "random" and "tiles" encodings.
env = envs.make("GridWorld-v0")
df_gridworld = control_and_evaluate(
    env,
    args=[
        dict(name="random", enc_size=64),
        dict(name="tiles", tiling_dim=6),
    ],
)


In [None]:
# Bare expression: shows the GridWorld results table as the cell output.
df_gridworld

### Ice World

In [None]:
# IceWorld: train and evaluate with the "random" and "tiles" encodings.
env = envs.make("IceWorld-v0")
df_iceworld = control_and_evaluate(
    env,
    args=[
        dict(name="random", enc_size=64),
        dict(name="tiles", tiling_dim=6),
    ],
)

In [None]:
# Bare expression: shows the IceWorld results table as the cell output.
df_iceworld

### RedGreen

In [None]:
# RedGreen: train and evaluate with the "random" and "tiles" encodings.
env = envs.make("RedGreen-v0")
df_redgreen = control_and_evaluate(
    env,
    args=[
        dict(name="random", enc_size=32),
        dict(name="tiles", tiling_dim=6),
    ],
)

In [None]:
# Bare expression: shows the RedGreen results table as the cell output.
df_redgreen

### Mountain Car

In [None]:
# MountainCar: "gaussian-mix" vs "tiles" encodings, with a larger step size,
# more exploration and far fewer training episodes than the defaults.
env = envs.make("MountainCar-v0")
df_mountaincar = control_and_evaluate(
    env,
    args=[
        dict(
            name="gaussian-mix",
            params=dict(n_components=384 // 3, covariance_type="diag"),
        ),
        dict(name="tiles", tiling_dim=6),
    ],
    alpha=0.01,
    epsilon=0.2,
    num_episodes=100,
)

### GEM

In [None]:
# GEM (Finite-CC-PMSM): "gaussian-mix" vs "scale" encodings, with a larger
# step size and more exploration than the defaults.
env = envs.make("Finite-CC-PMSM-v0", max_time_steps=5000)
df_gem = control_and_evaluate(
    env,
    args=[
        dict(
            name="gaussian-mix",
            params=dict(n_components=11, covariance_type="diag"),
        ),
        dict(name="scale"),
    ],
    alpha=0.01,
    epsilon=0.2,
)