<a href="https://colab.research.google.com/github/claudia-viaro/optimal_switching/blob/main/LSPI_gym.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Least Square Policy Iteration with gym
# not really doing what I want atm

In [2]:
!pip install gym-stopping

Collecting gym-stopping
  Downloading gym_stopping-0.0.1-py3-none-any.whl (4.3 kB)
Installing collected packages: gym-stopping
Successfully installed gym-stopping-0.0.1


In [3]:
import itertools
from sklearn.preprocessing import PolynomialFeatures
#Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree
from collections import namedtuple
import numpy as np
import math
import random

import gym
import gym_stopping
from gym import spaces
from gym.utils import seeding

import pandas as pd

import matplotlib.pyplot as plt

from gym import error, spaces, utils
import pandas.testing as tm
gym.logger.set_level(40)


In [4]:
class Agent:
    def __init__(self, env, preprocess_obs=None):
        if preprocess_obs is None:
            preprocess_obs = lambda x: x

        self.env = env
        self.action_size = self.env.action_space.n
        self.preprocess_obs = preprocess_obs
        self.features_size = self.get_features_size()
        self.init_weights()

    # take initial weights to be normally distributed (could do zero as well)
    def init_weights(self, scale=1.):
        size = self.features_size * self.action_size
        self.weights = np.random.normal(size=size, scale=scale)

    def set_weights(self, weights):
        self.weights = weights

    def get_features_size(self):
        obs = self.env.observation_space.sample()[0]
        features = self.get_features(obs)
        return len(features)

    def get_features(self, obs):
        obs = self.preprocess_obs(obs)
        return obs

    def predict(self, obs):
        values = np.dot(
            self.weights.reshape(self.action_size, self.features_size),
            self.get_features(obs))
        action = np.argmax(values)
        return action

In [5]:
class PolynomialAgent(Agent):
    def __init__(self, env, degree, preprocess_obs=None):
        self.poly = PolynomialFeatures(degree)
        super(PolynomialAgent, self).__init__(env, preprocess_obs)

    def _get_features(self, obs):
        if not type(obs) in [np.ndarray, list, tuple]:
            obs = [obs]
        return self.poly.fit_transform([obs])[0] # fit_transform just return the matrix of transformed values according to the degree chosen

In [6]:
class RadialAgent(Agent):
    def __init__(self, env, centers, sigma=1., preprocess_obs=None):
        self.centers = centers
        self.sigma2 = sigma**2
        super(RadialAgent, self).__init__(env, preprocess_obs)

    def _get_features(self, obs):
        dists = np.power(self.centers - obs, 2)
        rbfs = np.exp(-dists.sum(1) / (2 * self.sigma2))
        return np.append(rbfs, [1.])

    @staticmethod
    def get_centers_from_grids(grids):
        return np.array(list(itertools.product(*grids)))

In [7]:
Sample = namedtuple('Sample', ['s', 'a', 'r', 's_'])


class LSPolicyIteration:
    def __init__(self,
                 env,
                 agent,
                 gamma,
                 memory_size,
                 memory_type='sample',
                 eval_type='batch'):
        """Least-Squares Policy Iteration algorithm
        Args:
            env (gym.Env): gym environment.
            agent (lspi.agents.Agent): features policy.
            gamma (float): discount factor.
            memory_size (int): number of training samples/episodes.
            memory_type (str, optional): samples collecting method. Defaults to 'sample'.
            eval_type (str, optional): policy evaluation method. Defaults to 'batch'.
        """
        if not memory_type in ['sample', 'episode']:
            raise ValueError(
                "memory_type can take values ['sample','episode']")
        if not eval_type in ['iterative', 'sherman_morrison', 'batch']:
            raise ValueError(
                "eval_type can take values ['iterative','sherman_morrison','batch']"
            )
        self.env = env
        self.gamma = gamma
        self.agent = agent
        self.memory_size = memory_size
        self.eval_type = eval_type
        self.memory_type = memory_type

    # for a numb of times, record some trajectories in the object memory
    def init_memory(self):
        self.memory = []
        count = 0
        done = True
        while count < (self.memory_size + 1):
            if done:
                obs[0] = self.env.reset() # spot price
                if self.memory_type == 'episode':
                    count += 1
            action = self.env.action_space.sample()
            next_obs, reward, done, _ = self.env.step(action)
            self.memory.append(Sample(obs, action, reward, next_obs[0]))
            obs = next_obs
            if self.memory_type == 'sample':
                count += 1

        if self.eval_type == 'batch':
            k = self.agent.features_size # length of a sampled obs
            nActions = self.agent.action_size
            # matrix A
            self.A_all = np.zeros(
                (len(self.memory), nActions, k * nActions, k * nActions))
            self.b_all = np.zeros(k * nActions)
            for idx, sample in enumerate(self.memory):
                # state features
                feat_s = np.zeros(k * nActions)
                a = sample.a
                feat_s[a * k:(a + 1) * k] = self.agent.get_features(sample.s)
                # next state features
                feat_ = self.agent.get_features(sample.s_)
                for a_ in range(nActions):
                    feat_s_ = np.zeros(k * nActions)
                    feat_s_[a_ * k:(a_ + 1) * k] = feat_
                    self.A_all[idx, a_, :, :] = np.outer(
                        feat_s, feat_s - self.gamma * feat_s_)
                # reward features
                self.b_all += sample.r * feat_s

    def eval(self):
        k = self.agent.features_size # length of a sampled obs
        nActions = self.agent.action_size
        if self.eval_type == 'iterative':
            A = np.zeros((k * nActions, k * nActions))
            b = np.zeros(k * nActions)
            for sample in self.memory:
                # state features
                feat_s = np.zeros(k * nActions)
                a = sample.a
                feat_s[a * k:(a + 1) * k] = self.preprocess_ob(sample.s)
                # next state features
                feat_s_ = np.zeros(k * nActions)
                a_ = self.agent.predict(sample.s_)
                feat_s_[a_ * k:(a_ + 1) * k] = self.agent.preprocess_ob(
                    sample.s_)
                # update parameters
                A += np.outer(feat_s, feat_s - self.gamma * feat_s_)
                b += sample.r * feat_s
            w = np.linalg.solve(A, b)
        
        elif self.eval_type == 'batch':
            A = np.array([
                self.A_all[idx, self.agent.predict(sample.s_)]
                for idx, sample in enumerate(self.memory)
            ]).sum(0)
            b = self.b_all
            w = np.linalg.solve(A, b)
        return w

    def train_step(self):
        w = self.eval()
        self.agent.set_weights(w)

In [8]:
def evaluate_policy(agent, env, max_length=1000, n_eval_episodes=10):
    """Runs policy for ``n_eval_episodes`` episodes.
    
    Adapted from :
    https://stable-baselines.readthedocs.io/en/master/_modules/stable_baselines/common/evaluation.html
    Args:
        agent (lspi.agents.Agent): features policy.
        env (gym.Env): gym environment.
        max_length (int, optional): maximum episode length. Defaults to 1000.
        n_eval_episodes (int, optional): number of episode to evaluate the agent. Defaults to 10.
    Returns:
        episode_rewards (List[float]): list of reward per episode
        episode_lengths (List[int]): list of length per episode
    """
    episode_rewards, episode_lengths = [], []
    while len(episode_rewards) < n_eval_episodes:
        obs[0] = env.reset()
        done = False
        episode_reward = 0.0
        episode_length = 0
        while not (done or episode_length == max_length):
            action = agent.predict(obs)
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            episode_length += 1
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)

    return episode_rewards, episode_lengths

In [9]:
#average numb of steps
def score(agent):
    _, episode_lengths = evaluate_policy(agent,
                                         agent.env,
                                         max_length=3000,
                                         n_eval_episodes=10)
    return int(np.mean(episode_lengths))

In [10]:
env = gym.make("stopping-v0")
obs = env.observation_space.sample()
obs = obs[0]





# RBF features
# the agent
grids = [[-np.pi / 4., 0., np.pi / 4], [-1., 0., 1.]]
centers = RadialAgent.get_centers_from_grids(grids)
sigma = 1.
agent = RadialAgent(env, centers, sigma)

# build the trainer
gamma = 0.95
memory_size = 1000
memory_type = 'episode'
eval_type = 'batch'
baseline = LSPolicyIteration(env, agent, gamma, memory_size, memory_type, eval_type)

# build the memory
baseline.init_memory()
print('memory size = {}'.format(len(baseline.memory)))

TypeError: ignored

In [None]:
# run the algorithm
n_iter = 10
steps = score(agent)
print('iteration = {:02d} - average number of balancing steps : {:04d}'.format(
    0, steps))
for it in range(1, n_iter + 1):
    baseline.train_step()
    steps = score(agent)
    print('iteration = {:02d} - average number of balancing steps : {:04d}'.
          format(it, steps))

In [None]:
range_episodic_memory = np.linspace(50, 1000, 20).astype(int)
n_trials = 10
results = []
for memory_size in range_episodic_memory:
    results.append([])

    # build the trainer
    baseline = LSPolicyIteration(env, agent, gamma, memory_size,
                                                memory_type, eval_type)
    for _ in range(n_trials):

        # build the memory
        baseline.init_memory()
        # initialize the agent
        agent.init_weights()

        # run the algorithm
        for it in range(1, n_iter + 1):
            baseline.train_step()
        steps = score(agent)
        results[-1].append(steps)

    print('n_episodes = {:04d} - average number of balancing steps : {:04d}'.
          format(memory_size, int(np.mean(results[-1]))))

In [None]:
plt.style.use('ggplot')
plt.title('American Option: Average balancing steps')
plt.xlabel('Number of training episodes')
plt.ylabel('Steps')
plt.xlim(0, 1000)
plt.ylim(0, 3010)

x = range_episodic_memory

y_mean = np.mean(results, 1)
plt.plot(x, y_mean, color='blue')

y_max = np.max(results, 1)
plt.plot(x, y_max, '--', color='pink')

y_min = np.min(results, 1)
plt.plot(x, y_min, '--', color='pink')

y_std = np.std(results, 1)
plt.fill_between(x, y_mean - y_std, y_mean + y_std, color='blue', alpha=.1)

plt.show()

In [None]:
'''
to check
'''
env = gym.make("stopping-v0")
obs = env.observation_space.sample()[0]
print(obs)
obs_shape = env.observation_space.shape
print(obs_shape)



preprocess_obs = lambda x: x
features = preprocess_obs(obs)
print(obs, obs.itemsize)
print(env.reset()[0])


baseline = LSPolicyIteration(env, agent, gamma, memory_size, memory_type, eval_type)

baseline.init_memory()
print(baseline.memory)
print('memory size = {}'.format(len(baseline.memory)))