In [1]:
import or_suite
import numpy as np

import copy

import os
from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
import pandas as pd


import gym

## Set Global Parameters for Experiments

In [2]:
# Experiment sizing.
epLen = 100     # written into CONFIG['epLen'] in the configuration cell
nEps = 2        # NOTE(review): presumably episodes per run — not read in the cells shown here
numIters = 10   # NOTE(review): presumably repetitions per experiment — not read in the cells shown here

# Environment flag, written into CONFIG['neg_inventory'] in the configuration cell.
neg_inventory = False

### Two Suppliers

In [3]:
# Work on a private copy: assigning the library dict directly would alias it,
# and the item assignments below would then modify or_suite's module-level
# configuration for everything else running in this kernel.
CONFIG = copy.deepcopy(
    or_suite.envs.env_configs.inventory_control_multiple_suppliers_modified_config
)
CONFIG['neg_inventory'] = neg_inventory
# Deterministic demand: the callable ignores its argument and always returns 10.
CONFIG['demand_dist'] = lambda x: 10
CONFIG['epLen'] = epLen
print(epLen)


100


In [4]:
# Build the environment registered as 'MultipleSuppliers-v0' from the CONFIG dict.
env = gym.make('MultipleSuppliers-v0', config=CONFIG)
# Monitor-wrapped handle on the same environment; the rollout cells below
# step `env` directly rather than `mon_env`.
mon_env = Monitor(env)

In [5]:
# Instantiate the base-surge agent with arguments [14] and 5.
# NOTE(review): presumably [14] is the fixed per-step order quantity and 5 the
# order-up-to level — confirm against base_surgeAgent's signature.
agent = or_suite.agents.inventory_control_multiple_suppliers.base_surge.base_surgeAgent([14],5)

In [6]:
# Hand the environment and its configuration to the agent before it is asked
# for any action.
agent.update_config(env, CONFIG)

In [7]:
def to_dictionary(state):
    """Split a flat state sequence into three labelled pieces.

    Args:
        state: An indexable, sliceable sequence (e.g. list or NumPy array).
            The rollouts in this notebook print 7-entry states.

    Returns:
        dict with keys, in this order:
            'first_supplier'  -- ``state[0:1]`` (a length-1 slice, not a scalar)
            'second_supplier' -- ``state[1:6]``
            'inventory'       -- ``state[-1]`` (the last element)

    Note:
        The slice bounds are hard-coded. Presumably they correspond to supplier
        lead times of 1 and 5 — verify against the environment configuration.
    """
    return dict(
        first_supplier=state[:1],
        second_supplier=state[1:6],
        inventory=state[-1],
    )

### Tailored Base-Surge (TBS) Agent

In [8]:
# Roll the base-surge agent forward for up to 10 steps, printing each transition.
env.reset()
state = env.state  # start from the state the environment holds right after reset
for t in range(10):
    print(f'Time Step: {t}')
    action = agent.pick_action(state, t)
    print(f'State, Agent Action: {state, action}')
    state, reward, done, info = env.step(action)
    print(f'New State, Reward, Info: {state, reward, info}')
    if done:
        # Episode finished (possible if epLen is set below the loop length);
        # stepping again without a reset would not be a valid transition.
        break

Time Step: 0
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([14,  5]))
State after adjusting for new product: [14  0  0  0  0  5  0]
New State, Reward, Info: (array([14,  0,  0,  0,  0,  5,  0]), -2235.0, {'demand': 10})
Time Step: 1
State, Agent Action: (array([14,  0,  0,  0,  0,  5,  0]), array([14,  5]))
State after adjusting for new product: [14  0  0  0  5  5 14]
New State, Reward, Info: (array([14,  0,  0,  0,  5,  5,  4]), -239.0, {'demand': 10})
Time Step: 2
State, Agent Action: (array([14,  0,  0,  0,  5,  5,  4]), array([14,  1]))
State after adjusting for new product: [14  0  0  5  5  1 18]
New State, Reward, Info: (array([14,  0,  0,  5,  5,  1,  8]), -223.0, {'demand': 10})
Time Step: 3
State, Agent Action: (array([14,  0,  0,  5,  5,  1,  8]), array([14,  0]))
State after adjusting for new product: [14  0  5  5  1  0 22]
New State, Reward, Info: (array([14,  0,  5,  5,  1,  0, 12]), -222.0, {'demand': 10})
Time Step: 4
State, Agent Action: (array([14,  0,  5, 

### Always Zero

In [9]:
# Baseline rollout: order nothing from either supplier for up to 10 steps.
env.reset()
state = env.state  # start from the state the environment holds right after reset
action = np.asarray([0, 0])  # same order every step, so build it once
for t in range(10):
    print(f'Time Step: {t}')
    print(f'State, Agent Action: {state, action}')
    state, reward, done, info = env.step(action)
    print(f'New State, Reward, Info: {state, reward, info}')
    if done:
        # Episode finished (possible if epLen is set below the loop length);
        # stepping again without a reset would not be a valid transition.
        break
    

Time Step: 0
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([0, 0]))
State after adjusting for new product: [0 0 0 0 0 0 0]
New State, Reward, Info: (array([0, 0, 0, 0, 0, 0, 0]), -2000.0, {'demand': 10})
Time Step: 1
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([0, 0]))
State after adjusting for new product: [0 0 0 0 0 0 0]
New State, Reward, Info: (array([0, 0, 0, 0, 0, 0, 0]), -2000.0, {'demand': 10})
Time Step: 2
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([0, 0]))
State after adjusting for new product: [0 0 0 0 0 0 0]
New State, Reward, Info: (array([0, 0, 0, 0, 0, 0, 0]), -2000.0, {'demand': 10})
Time Step: 3
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([0, 0]))
State after adjusting for new product: [0 0 0 0 0 0 0]
New State, Reward, Info: (array([0, 0, 0, 0, 0, 0, 0]), -2000.0, {'demand': 10})
Time Step: 4
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([0, 0]))
State after adjusting for new product: [0 0 0 0 0 0 0]
N

### 10 from supplier 1

In [10]:
# Baseline rollout: order 10 units from the first supplier and none from the
# second, for up to 10 steps.
env.reset()
state = env.state  # start from the state the environment holds right after reset
action = np.asarray([10, 0])  # same order every step, so build it once
for t in range(10):
    print(f'Time Step: {t}')
    print(f'State, Agent Action: {state, action}')
    state, reward, done, info = env.step(action)
    print(f'New State, Reward, Info: {state, reward, info}')
    if done:
        # Episode finished (possible if epLen is set below the loop length);
        # stepping again without a reset would not be a valid transition.
        break

Time Step: 0
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([10,  0]))
State after adjusting for new product: [10  0  0  0  0  0  0]
New State, Reward, Info: (array([10,  0,  0,  0,  0,  0,  0]), -2150.0, {'demand': 10})
Time Step: 1
State, Agent Action: (array([10,  0,  0,  0,  0,  0,  0]), array([10,  0]))
State after adjusting for new product: [10  0  0  0  0  0 10]
New State, Reward, Info: (array([10,  0,  0,  0,  0,  0,  0]), -150.0, {'demand': 10})
Time Step: 2
State, Agent Action: (array([10,  0,  0,  0,  0,  0,  0]), array([10,  0]))
State after adjusting for new product: [10  0  0  0  0  0 10]
New State, Reward, Info: (array([10,  0,  0,  0,  0,  0,  0]), -150.0, {'demand': 10})
Time Step: 3
State, Agent Action: (array([10,  0,  0,  0,  0,  0,  0]), array([10,  0]))
State after adjusting for new product: [10  0  0  0  0  0 10]
New State, Reward, Info: (array([10,  0,  0,  0,  0,  0,  0]), -150.0, {'demand': 10})
Time Step: 4
State, Agent Action: (array([10,  0,  0, 

### 10 from supplier 2

In [11]:
# Baseline rollout: order 10 units from the second supplier and none from the
# first, for up to 15 steps (longer than the other baselines in this notebook).
env.reset()
state = env.state  # start from the state the environment holds right after reset
action = np.asarray([0, 10])  # same order every step, so build it once
for t in range(15):
    print(f'Time Step: {t}')
    print(f'State, Agent Action: {state, action}')
    state, reward, done, info = env.step(action)
    print(f'New State, Reward, Info: {state, reward, info}')
    if done:
        # Episode finished (possible if epLen is set below the loop length);
        # stepping again without a reset would not be a valid transition.
        break

Time Step: 0
State, Agent Action: (array([0, 0, 0, 0, 0, 0, 0]), array([ 0, 10]))
State after adjusting for new product: [ 0  0  0  0  0 10  0]
New State, Reward, Info: (array([ 0,  0,  0,  0,  0, 10,  0]), -2050.0, {'demand': 10})
Time Step: 1
State, Agent Action: (array([ 0,  0,  0,  0,  0, 10,  0]), array([ 0, 10]))
State after adjusting for new product: [ 0  0  0  0 10 10  0]
New State, Reward, Info: (array([ 0,  0,  0,  0, 10, 10,  0]), -2050.0, {'demand': 10})
Time Step: 2
State, Agent Action: (array([ 0,  0,  0,  0, 10, 10,  0]), array([ 0, 10]))
State after adjusting for new product: [ 0  0  0 10 10 10  0]
New State, Reward, Info: (array([ 0,  0,  0, 10, 10, 10,  0]), -2050.0, {'demand': 10})
Time Step: 3
State, Agent Action: (array([ 0,  0,  0, 10, 10, 10,  0]), array([ 0, 10]))
State after adjusting for new product: [ 0  0 10 10 10 10  0]
New State, Reward, Info: (array([ 0,  0, 10, 10, 10, 10,  0]), -2050.0, {'demand': 10})
Time Step: 4
State, Agent Action: (array([ 0,  0, 1